From d2e1dec54ba18042622050b0b59f95d5b83f93c0 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Mon, 19 May 2025 16:53:15 +0200 Subject: [PATCH] Fix Unicode matching for regex options The `Pattern` class needs to operate on ordinary "Unicode" Java string, whereas both option values and strings matched against the pattern are in Bazel's internal string encoding. This requires reencoding in the right places. Along the way, optimize literal regex matches - this was missed in 662c0f0ed15a7e27f9c9caf08fc60af0fd993ccb. --- .../devtools/build/lib/analysis/config/BUILD | 1 + .../config/ExecutionInfoModifier.java | 7 ++-- .../google/devtools/build/lib/events/BUILD | 1 + .../build/lib/events/OutputFilter.java | 25 ++++--------- .../devtools/build/lib/exec/local/BUILD | 1 + .../lib/exec/local/LocalSpawnRunner.java | 4 ++- .../build/lib/unsafe/StringUnsafe.java | 5 +-- .../devtools/build/lib/util/regex/BUILD | 3 ++ .../build/lib/util/regex/RegexUtil.java | 18 +++++++--- .../com/google/devtools/common/options/BUILD | 1 + .../devtools/common/options/Converters.java | 4 ++- .../common/options/RegexPatternOption.java | 8 ++++- .../devtools/build/lib/util/regex/BUILD | 3 +- .../lib/util/regex/RegexUtilFuzzTest.java | 12 +++---- .../build/lib/util/regex/RegexUtilTest.java | 2 ++ .../com/google/devtools/common/options/BUILD | 1 + .../options/RegexPatternConverterTest.java | 36 +++++++++++++++++++ 17 files changed, 94 insertions(+), 38 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/analysis/config/BUILD b/src/main/java/com/google/devtools/build/lib/analysis/config/BUILD index ca7296cd6b387f..9f6065063f6b7a 100644 --- a/src/main/java/com/google/devtools/build/lib/analysis/config/BUILD +++ b/src/main/java/com/google/devtools/build/lib/analysis/config/BUILD @@ -246,6 +246,7 @@ java_library( name = "execution_info_modifier", srcs = ["ExecutionInfoModifier.java"], deps = [ + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/common/options", "//third_party:auto_value", "//third_party:guava", diff --git a/src/main/java/com/google/devtools/build/lib/analysis/config/ExecutionInfoModifier.java b/src/main/java/com/google/devtools/build/lib/analysis/config/ExecutionInfoModifier.java index 33894fb577614f..512022907229c8 100644 --- a/src/main/java/com/google/devtools/build/lib/analysis/config/ExecutionInfoModifier.java +++ b/src/main/java/com/google/devtools/build/lib/analysis/config/ExecutionInfoModifier.java @@ -14,6 +14,8 @@ package com.google.devtools.build.lib.analysis.config; +import static com.google.devtools.build.lib.util.StringEncoding.internalToUnicode; + import com.google.auto.value.AutoValue; import com.google.auto.value.extension.memoized.Memoized; import com.google.common.base.Splitter; @@ -102,7 +104,8 @@ private static ExecutionInfoModifier create(String input, ImmutableList expr.pattern().matcher(mnemonic).matches()); + return expressions().stream() + .anyMatch(expr -> expr.pattern().matcher(internalToUnicode(mnemonic)).matches()); } /** Checks whether the {@code executionInfoList} matches the {@code mnemonic}. */ @@ -139,7 +142,7 @@ public static void apply( /** Modifies the given map of {@code executionInfo} to add or remove the keys for this option. */ void apply(String mnemonic, Map executionInfo) { for (Expression expr : expressions()) { - if (expr.pattern().matcher(mnemonic).matches()) { + if (expr.pattern().matcher(internalToUnicode(mnemonic)).matches()) { if (expr.remove()) { executionInfo.remove(expr.key()); } else { diff --git a/src/main/java/com/google/devtools/build/lib/events/BUILD b/src/main/java/com/google/devtools/build/lib/events/BUILD index 8b805673b9fdf3..1c1679654668f3 100644 --- a/src/main/java/com/google/devtools/build/lib/events/BUILD +++ b/src/main/java/com/google/devtools/build/lib/events/BUILD @@ -15,6 +15,7 @@ java_library( name = "events", srcs = glob(["*.java"]), deps = [ + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/build/lib/util/io:out-err", "//src/main/java/net/starlark/java/eval", "//src/main/java/net/starlark/java/syntax", diff --git a/src/main/java/com/google/devtools/build/lib/events/OutputFilter.java b/src/main/java/com/google/devtools/build/lib/events/OutputFilter.java index bea4ddf190ed26..3fd2b2042934de 100644 --- a/src/main/java/com/google/devtools/build/lib/events/OutputFilter.java +++ b/src/main/java/com/google/devtools/build/lib/events/OutputFilter.java @@ -14,32 +14,19 @@ package com.google.devtools.build.lib.events; +import com.google.devtools.build.lib.util.StringEncoding; import java.util.regex.Pattern; -/** - * An output filter for warnings. - */ +/** An output filter for warnings. */ public interface OutputFilter { /** An output filter that matches everything. */ - public static final OutputFilter OUTPUT_EVERYTHING = new OutputFilter() { - @Override - public boolean showOutput(String tag) { - return true; - } - }; + OutputFilter OUTPUT_EVERYTHING = tag -> true; /** An output filter that matches nothing. */ - public static final OutputFilter OUTPUT_NOTHING = new OutputFilter() { - @Override - public boolean showOutput(String tag) { - return false; - } - }; + OutputFilter OUTPUT_NOTHING = tag -> false; - /** - * Returns true iff the given tag matches the output filter. - */ + /** Returns true iff the given tag matches the output filter. */ boolean showOutput(String tag); /** An output filter using regular expression matching. */ @@ -62,7 +49,7 @@ private RegexOutputFilter(Pattern pattern) { @Override public boolean showOutput(String tag) { - return pattern.matcher(tag).find(); + return pattern.matcher(StringEncoding.internalToUnicode(tag)).find(); } @Override diff --git a/src/main/java/com/google/devtools/build/lib/exec/local/BUILD b/src/main/java/com/google/devtools/build/lib/exec/local/BUILD index 6a0756132f9c5c..fdcc87062a01c8 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/local/BUILD +++ b/src/main/java/com/google/devtools/build/lib/exec/local/BUILD @@ -39,6 +39,7 @@ java_library( "//src/main/java/com/google/devtools/build/lib/shell", "//src/main/java/com/google/devtools/build/lib/util", "//src/main/java/com/google/devtools/build/lib/util:os", + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/build/lib/util/io", "//src/main/java/com/google/devtools/build/lib/vfs", "//src/main/protobuf:failure_details_java_proto", diff --git a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java index 042f3fbcb27990..0c677d6f17ff8c 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java @@ -55,6 +55,7 @@ import com.google.devtools.build.lib.shell.SubprocessBuilder; import com.google.devtools.build.lib.shell.TerminationStatus; import com.google.devtools.build.lib.util.NetUtil; +import com.google.devtools.build.lib.util.StringEncoding; import com.google.devtools.build.lib.util.io.FileOutErr; import com.google.devtools.build.lib.vfs.Path; import com.google.errorprone.annotations.FormatMethod; @@ -322,7 +323,8 @@ private SpawnResult start() throws InterruptedException, ExecException, IOExcept ("Action type " + actionType + " is not allowed to run locally due to regex filter: " - + localExecutionOptions.allowedLocalAction.regexPattern() + + StringEncoding.unicodeToInternal( + localExecutionOptions.allowedLocalAction.regexPattern().toString()) + "\n") .getBytes(UTF_8)); spawnMetrics.setTotalTime(totalTimeStopwatch.elapsed()); diff --git a/src/main/java/com/google/devtools/build/lib/unsafe/StringUnsafe.java b/src/main/java/com/google/devtools/build/lib/unsafe/StringUnsafe.java index a4044e30142f76..1e42d7706dbc29 100644 --- a/src/main/java/com/google/devtools/build/lib/unsafe/StringUnsafe.java +++ b/src/main/java/com/google/devtools/build/lib/unsafe/StringUnsafe.java @@ -91,8 +91,9 @@ public static byte[] getInternalStringBytes(String obj) { // Truncation is ASCII only and thus doesn't change the encoding. String truncatedString = Ascii.truncate(obj, 1000, "..."); throw new IllegalArgumentException( - "Expected internal string with Latin-1 coder, got: %s (%s)" - .formatted(truncatedString, Arrays.toString(getByteArray(truncatedString)))); + String.format( + "Expected internal string with Latin-1 coder, got: %s (%s)", + truncatedString, Arrays.toString(getByteArray(truncatedString)))); } return getByteArray(obj); } diff --git a/src/main/java/com/google/devtools/build/lib/util/regex/BUILD b/src/main/java/com/google/devtools/build/lib/util/regex/BUILD index f6bbaca854a6db..6f58114741e772 100644 --- a/src/main/java/com/google/devtools/build/lib/util/regex/BUILD +++ b/src/main/java/com/google/devtools/build/lib/util/regex/BUILD @@ -14,4 +14,7 @@ filegroup( java_library( name = "regex_util", srcs = ["RegexUtil.java"], + deps = [ + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", + ], ) diff --git a/src/main/java/com/google/devtools/build/lib/util/regex/RegexUtil.java b/src/main/java/com/google/devtools/build/lib/util/regex/RegexUtil.java index 349e021f0561cd..bcaa77a7143d9e 100644 --- a/src/main/java/com/google/devtools/build/lib/util/regex/RegexUtil.java +++ b/src/main/java/com/google/devtools/build/lib/util/regex/RegexUtil.java @@ -13,6 +13,9 @@ // limitations under the License. package com.google.devtools.build.lib.util.regex; +import static com.google.devtools.build.lib.util.StringEncoding.internalToUnicode; +import static com.google.devtools.build.lib.util.StringEncoding.unicodeToInternal; + import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -31,7 +34,8 @@ public final class RegexUtil { Pattern.compile("(?:\\^|\\\\A)*\\.\\*(?[^?+].*)?"); /** - * Returns a {@link Predicate} that matches the input string against the regex pattern with the + * Returns a {@link Predicate} that matches the input string (in internal encoding according to + * {@link com.google.devtools.build.lib.util.StringEncoding}) against the regex pattern with the * same semantics as {@link Matcher#matches()}, but more optimized and as if the pattern was * compiled with {@link Pattern#DOTALL}. */ @@ -43,7 +47,11 @@ public static Predicate asOptimizedMatchingPredicate(Pattern regexPatter // engine's optimization pass exploited below - it only applies when the first node is a // literal, not a group. // Unmatched \Q...\E sequences can have the same effect. - return s -> regexPattern.matcher(s).matches(); + return s -> regexPattern.matcher(internalToUnicode(s)).matches(); + } + // If the pattern matches a literal string, optimize to an equals call. + if (LITERAL_PATTERN_WITH_DOT_UNESCAPED.matcher(pattern).matches()) { + return unicodeToInternal(pattern.replace("\\.", "."))::equals; } // Recognize a pattern that starts with ".*" and drop it so that the engine sees the next part // of the pattern as the beginning and potentially optimizes the literal search. @@ -51,7 +59,7 @@ public static Predicate asOptimizedMatchingPredicate(Pattern regexPatter // determine if a ".*" suffix is safe to drop due to backslash escapes. Matcher suffixMatch = EXTRACT_SUFFIX_MATCH.matcher(pattern); if (!suffixMatch.matches()) { - return s -> regexPattern.matcher(s).matches(); + return s -> regexPattern.matcher(internalToUnicode(s)).matches(); } String suffixPattern = suffixMatch.group("suffix"); // A null suffixPattern implies the regex is equivalent to ".*", which matches any input string. @@ -61,14 +69,14 @@ public static Predicate asOptimizedMatchingPredicate(Pattern regexPatter // If the pattern matches a literal suffix, optimize to a string suffix // match, which is by far the fastest way to match. if (LITERAL_PATTERN_WITH_DOT_UNESCAPED.matcher(suffixPattern).matches()) { - String literalSuffix = suffixPattern.replace("\\.", "."); + String literalSuffix = unicodeToInternal(suffixPattern.replace("\\.", ".")); return s -> s.endsWith(literalSuffix); } // Turn the "match" pattern into an equivalent "find" pattern, since these are the only ones // that benefit from the Boyer-Moore optimization in the Java regex engine. // https://github.com/openjdk/jdk/blob/50dced88ff1aed23bb4c8fe9e4a08e6cc200b897/src/java.base/share/classes/java/util/regex/Pattern.java#L1959-L1969 Pattern compiled = Pattern.compile(suffixPattern + "\\z", Pattern.DOTALL); - return s -> compiled.matcher(s).find(); + return s -> compiled.matcher(internalToUnicode(s)).find(); } private RegexUtil() {} diff --git a/src/main/java/com/google/devtools/common/options/BUILD b/src/main/java/com/google/devtools/common/options/BUILD index 72975c2eab1056..3e8acf9d01f2af 100644 --- a/src/main/java/com/google/devtools/common/options/BUILD +++ b/src/main/java/com/google/devtools/common/options/BUILD @@ -50,6 +50,7 @@ java_library( ), deps = [ "//src/main/java/com/google/devtools/build/lib/util:pair", + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/build/lib/util/regex:regex_util", "//src/main/java/net/starlark/java/spelling", "//third_party:auto_value", diff --git a/src/main/java/com/google/devtools/common/options/Converters.java b/src/main/java/com/google/devtools/common/options/Converters.java index d6092b9e3bea24..4bfc8c5279b727 100644 --- a/src/main/java/com/google/devtools/common/options/Converters.java +++ b/src/main/java/com/google/devtools/common/options/Converters.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; +import com.google.devtools.build.lib.util.StringEncoding; import java.time.Duration; import java.util.Iterator; import java.util.List; @@ -422,7 +423,8 @@ public static class RegexPatternConverter extends Converter.ContextlessNote: Strings passed to the {@link Pattern} and {@link java.util.regex.Matcher} API have to + * be converted to "Unicode" form first (see {@link + * com.google.devtools.build.lib.util.StringEncoding#internalToUnicode}. + */ public abstract Pattern regexPattern(); /** diff --git a/src/test/java/com/google/devtools/build/lib/util/regex/BUILD b/src/test/java/com/google/devtools/build/lib/util/regex/BUILD index 5f1af94552fa86..3d1ca90295f1a2 100644 --- a/src/test/java/com/google/devtools/build/lib/util/regex/BUILD +++ b/src/test/java/com/google/devtools/build/lib/util/regex/BUILD @@ -38,7 +38,7 @@ java_binary( ) # Start fuzzing via -# bazel run //src/test/java/com/google/devtools/build/lib/util:RegexUtilFuzzTest_run +# bazel run //src/test/java/com/google/devtools/build/lib/util/regex:RegexUtilFuzzTest_run java_fuzz_test( name = "RegexUtilFuzzTest", srcs = ["RegexUtilFuzzTest.java"], @@ -53,6 +53,7 @@ java_fuzz_test( "notap", ], deps = [ + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/build/lib/util/regex:regex_util", "//third_party:jmh", ], diff --git a/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilFuzzTest.java b/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilFuzzTest.java index ecac6ba0f3beb4..cac2c4c521c624 100644 --- a/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilFuzzTest.java +++ b/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilFuzzTest.java @@ -13,6 +13,8 @@ // limitations under the License. package com.google.devtools.build.lib.util.regex; +import static com.google.devtools.build.lib.util.StringEncoding.unicodeToInternal; + import com.code_intelligence.jazzer.api.FuzzedDataProvider; import java.util.function.Predicate; import java.util.regex.Pattern; @@ -35,7 +37,9 @@ public static void fuzzerTestOneInput(FuzzedDataProvider data) { } Predicate optimizedMatcher = RegexUtil.asOptimizedMatchingPredicate(originalPattern); - if (optimizedMatcher.test(haystack) != originalPattern.matcher(haystack).matches()) { + boolean originalMatches = originalPattern.matcher(haystack).matches(); + boolean optimizedMatches = optimizedMatcher.test(unicodeToInternal(haystack)); + if (originalMatches != optimizedMatches) { throw new AssertionError( """ Optimized matcher and original matcher differ in behavior: @@ -44,11 +48,7 @@ public static void fuzzerTestOneInput(FuzzedDataProvider data) { originalPattern.matcher(haystack).matches(): %s optimizedMatcher.test(haystack): %s """ - .formatted( - needle, - haystack, - originalPattern.matcher(haystack).matches(), - optimizedMatcher.test(haystack))); + .formatted(needle, haystack, originalMatches, optimizedMatches)); } } } diff --git a/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilTest.java b/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilTest.java index ae502dc0fabf1a..f6114b80c5c2e3 100644 --- a/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilTest.java +++ b/src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilTest.java @@ -34,6 +34,7 @@ public void optimizedMatchingPredicate( "a", "foo", "foofoo", + "coverage.dat", "/coverage.dat", "/coverage.data", "/coverage1dat", @@ -55,6 +56,7 @@ public void optimizedMatchingPredicate( ".*?foo", ".*+foo", "^foo$", + "coverage\\.dat", ".*/coverage.dat", ".*/coverage\\.dat", ".*/test/.*/coverage\\.dat", diff --git a/src/test/java/com/google/devtools/common/options/BUILD b/src/test/java/com/google/devtools/common/options/BUILD index 598842cfbfc0a2..a67856fbb1bba3 100644 --- a/src/test/java/com/google/devtools/common/options/BUILD +++ b/src/test/java/com/google/devtools/common/options/BUILD @@ -43,6 +43,7 @@ java_test( deps = [ ":testutils", "//src/main/java/com/google/devtools/build/lib/util:classpath", + "//src/main/java/com/google/devtools/build/lib/util:string_encoding", "//src/main/java/com/google/devtools/common/options", "//src/main/java/com/google/devtools/common/options:invocation_policy", "//src/main/java/com/google/devtools/common/options/testing", diff --git a/src/test/java/com/google/devtools/common/options/RegexPatternConverterTest.java b/src/test/java/com/google/devtools/common/options/RegexPatternConverterTest.java index 8c6da91a7ecb9c..90c26c6b8a7967 100644 --- a/src/test/java/com/google/devtools/common/options/RegexPatternConverterTest.java +++ b/src/test/java/com/google/devtools/common/options/RegexPatternConverterTest.java @@ -16,6 +16,7 @@ import static com.google.common.truth.Truth.assertThat; import static org.junit.Assert.assertThrows; +import com.google.devtools.build.lib.util.StringEncoding; import com.google.devtools.common.options.Converters.RegexPatternConverter; import com.google.devtools.common.options.testing.ConverterTester; import com.google.testing.junit.testparameterinjector.TestParameterInjector; @@ -61,4 +62,39 @@ public void throwsForWrongPattern() { assertThrows(OptionsParsingException.class, () -> new RegexPatternConverter().convert("{")); assertThat(e).hasMessageThat().startsWith("Not a valid regular expression:"); } + + @Test + public void unicodeLiteral() throws OptionsParsingException { + // Options passed on the command line are passed to convertes in the internal encoding. + var regex = new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱")); + assertThat(regex.regexPattern().matcher("äöüÄÖÜß🌱").matches()).isTrue(); + assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱"))).isTrue(); + } + + @Test + public void unicodeLiteral_caseInsensitive() throws OptionsParsingException { + // Options passed on the command line are passed to convertes in the internal encoding. + var regex = + new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("(?ui)äöüÄÖÜß🌱")); + assertThat(regex.regexPattern().matcher("ÄÖÜäöüß🌱").matches()).isTrue(); + assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("ÄÖÜäöüß🌱"))).isTrue(); + } + + @Test + public void unicodeLiteral_suffix() throws OptionsParsingException { + // Options passed on the command line are passed to convertes in the internal encoding. + var regex = + new RegexPatternConverter().convert(StringEncoding.unicodeToInternal(".*äöüÄÖÜß🌱")); + assertThat(regex.regexPattern().matcher("äöüäöüÄÖÜß🌱").matches()).isTrue(); + assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüäöüÄÖÜß🌱"))).isTrue(); + } + + @Test + public void unicodeClass() throws OptionsParsingException { + // Options passed on the command line are passed to convertes in the internal encoding. + var regex = + new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("\\p{L}{7}\\p{IsEmoji}")); + assertThat(regex.regexPattern().matcher("äöüÄÖÜß🌱").matches()).isTrue(); + assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱"))).isTrue(); + } }