Skip to content

Commit e1ee0fd

Browse files
committed
Fix Unicode matching for regex options
The `Pattern` class needs to operate on ordinary "Unicode" Java string, whereas both option values and strings matched against the pattern are in Bazel's internal string encoding. This requires reencoding in the right places. Along the way, optimize literal regex matches - this was missed in 662c0f0.
1 parent 5b5e41c commit e1ee0fd

File tree

17 files changed

+95
-39
lines changed

17 files changed

+95
-39
lines changed

src/main/java/com/google/devtools/build/lib/analysis/config/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ java_library(
246246
name = "execution_info_modifier",
247247
srcs = ["ExecutionInfoModifier.java"],
248248
deps = [
249+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
249250
"//src/main/java/com/google/devtools/common/options",
250251
"//third_party:auto_value",
251252
"//third_party:guava",

src/main/java/com/google/devtools/build/lib/analysis/config/ExecutionInfoModifier.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
package com.google.devtools.build.lib.analysis.config;
1616

17+
import static com.google.devtools.build.lib.util.StringEncoding.internalToUnicode;
18+
1719
import com.google.auto.value.AutoValue;
1820
import com.google.auto.value.extension.memoized.Memoized;
1921
import com.google.common.base.Splitter;
@@ -102,7 +104,8 @@ private static ExecutionInfoModifier create(String input, ImmutableList<Expressi
102104
* Determines whether the given {@code mnemonic} (e.g. "CppCompile") matches any of the patterns.
103105
*/
104106
boolean matches(String mnemonic) {
105-
return expressions().stream().anyMatch(expr -> expr.pattern().matcher(mnemonic).matches());
107+
return expressions().stream()
108+
.anyMatch(expr -> expr.pattern().matcher(internalToUnicode(mnemonic)).matches());
106109
}
107110

108111
/** Checks whether the {@code executionInfoList} matches the {@code mnemonic}. */
@@ -139,7 +142,7 @@ public static void apply(
139142
/** Modifies the given map of {@code executionInfo} to add or remove the keys for this option. */
140143
void apply(String mnemonic, Map<String, String> executionInfo) {
141144
for (Expression expr : expressions()) {
142-
if (expr.pattern().matcher(mnemonic).matches()) {
145+
if (expr.pattern().matcher(internalToUnicode(mnemonic)).matches()) {
143146
if (expr.remove()) {
144147
executionInfo.remove(expr.key());
145148
} else {

src/main/java/com/google/devtools/build/lib/events/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ java_library(
1515
name = "events",
1616
srcs = glob(["*.java"]),
1717
deps = [
18+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
1819
"//src/main/java/com/google/devtools/build/lib/util/io:out-err",
1920
"//src/main/java/net/starlark/java/eval",
2021
"//src/main/java/net/starlark/java/syntax",

src/main/java/com/google/devtools/build/lib/events/OutputFilter.java

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,32 +14,19 @@
1414

1515
package com.google.devtools.build.lib.events;
1616

17+
import com.google.devtools.build.lib.util.StringEncoding;
1718
import java.util.regex.Pattern;
1819

19-
/**
20-
* An output filter for warnings.
21-
*/
20+
/** An output filter for warnings. */
2221
public interface OutputFilter {
2322

2423
/** An output filter that matches everything. */
25-
public static final OutputFilter OUTPUT_EVERYTHING = new OutputFilter() {
26-
@Override
27-
public boolean showOutput(String tag) {
28-
return true;
29-
}
30-
};
24+
OutputFilter OUTPUT_EVERYTHING = tag -> true;
3125

3226
/** An output filter that matches nothing. */
33-
public static final OutputFilter OUTPUT_NOTHING = new OutputFilter() {
34-
@Override
35-
public boolean showOutput(String tag) {
36-
return false;
37-
}
38-
};
27+
OutputFilter OUTPUT_NOTHING = tag -> false;
3928

40-
/**
41-
* Returns true iff the given tag matches the output filter.
42-
*/
29+
/** Returns true iff the given tag matches the output filter. */
4330
boolean showOutput(String tag);
4431

4532
/** An output filter using regular expression matching. */
@@ -62,7 +49,7 @@ private RegexOutputFilter(Pattern pattern) {
6249

6350
@Override
6451
public boolean showOutput(String tag) {
65-
return pattern.matcher(tag).find();
52+
return pattern.matcher(StringEncoding.internalToUnicode(tag)).find();
6653
}
6754

6855
@Override

src/main/java/com/google/devtools/build/lib/exec/local/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ java_library(
3939
"//src/main/java/com/google/devtools/build/lib/shell",
4040
"//src/main/java/com/google/devtools/build/lib/util",
4141
"//src/main/java/com/google/devtools/build/lib/util:os",
42+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
4243
"//src/main/java/com/google/devtools/build/lib/util/io",
4344
"//src/main/java/com/google/devtools/build/lib/vfs",
4445
"//src/main/protobuf:failure_details_java_proto",

src/main/java/com/google/devtools/build/lib/exec/local/LocalSpawnRunner.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import com.google.devtools.build.lib.shell.SubprocessBuilder;
5656
import com.google.devtools.build.lib.shell.TerminationStatus;
5757
import com.google.devtools.build.lib.util.NetUtil;
58+
import com.google.devtools.build.lib.util.StringEncoding;
5859
import com.google.devtools.build.lib.util.io.FileOutErr;
5960
import com.google.devtools.build.lib.vfs.Path;
6061
import com.google.errorprone.annotations.FormatMethod;
@@ -265,7 +266,7 @@ private SpawnResult runOnce() throws InterruptedException, ExecException, IOExce
265266

266267
@FormatMethod
267268
private void stepLog(Level level, @FormatString String fmt, Object... args) {
268-
stepLog(level, /*cause=*/ null, fmt, args);
269+
stepLog(level, /* cause= */ null, fmt, args);
269270
}
270271

271272
@FormatMethod
@@ -322,7 +323,8 @@ private SpawnResult start() throws InterruptedException, ExecException, IOExcept
322323
("Action type "
323324
+ actionType
324325
+ " is not allowed to run locally due to regex filter: "
325-
+ localExecutionOptions.allowedLocalAction.regexPattern()
326+
+ StringEncoding.unicodeToInternal(
327+
localExecutionOptions.allowedLocalAction.regexPattern().toString())
326328
+ "\n")
327329
.getBytes(UTF_8));
328330
spawnMetrics.setTotalTime(totalTimeStopwatch.elapsed());

src/main/java/com/google/devtools/build/lib/unsafe/StringUnsafe.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,9 @@ public static byte[] getInternalStringBytes(String obj) {
9191
// Truncation is ASCII only and thus doesn't change the encoding.
9292
String truncatedString = Ascii.truncate(obj, 1000, "...");
9393
throw new IllegalArgumentException(
94-
"Expected internal string with Latin-1 coder, got: %s (%s)"
95-
.formatted(truncatedString, Arrays.toString(getByteArray(truncatedString))));
94+
String.format(
95+
"Expected internal string with Latin-1 coder, got: %s (%s)",
96+
truncatedString, Arrays.toString(getByteArray(truncatedString))));
9697
}
9798
return getByteArray(obj);
9899
}

src/main/java/com/google/devtools/build/lib/util/regex/BUILD

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,7 @@ filegroup(
1414
java_library(
1515
name = "regex_util",
1616
srcs = ["RegexUtil.java"],
17+
deps = [
18+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
19+
],
1720
)

src/main/java/com/google/devtools/build/lib/util/regex/RegexUtil.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
// limitations under the License.
1414
package com.google.devtools.build.lib.util.regex;
1515

16+
import static com.google.devtools.build.lib.util.StringEncoding.internalToUnicode;
17+
import static com.google.devtools.build.lib.util.StringEncoding.unicodeToInternal;
18+
1619
import java.util.function.Predicate;
1720
import java.util.regex.Matcher;
1821
import java.util.regex.Pattern;
@@ -31,7 +34,8 @@ public final class RegexUtil {
3134
Pattern.compile("(?:\\^|\\\\A)*\\.\\*(?<suffix>[^?+].*)?");
3235

3336
/**
34-
* Returns a {@link Predicate} that matches the input string against the regex pattern with the
37+
* Returns a {@link Predicate} that matches the input string (in internal encoding according to
38+
* {@link com.google.devtools.build.lib.util.StringEncoding}) against the regex pattern with the
3539
* same semantics as {@link Matcher#matches()}, but more optimized and as if the pattern was
3640
* compiled with {@link Pattern#DOTALL}.
3741
*/
@@ -43,15 +47,19 @@ public static Predicate<String> asOptimizedMatchingPredicate(Pattern regexPatter
4347
// engine's optimization pass exploited below - it only applies when the first node is a
4448
// literal, not a group.
4549
// Unmatched \Q...\E sequences can have the same effect.
46-
return s -> regexPattern.matcher(s).matches();
50+
return s -> regexPattern.matcher(internalToUnicode(s)).matches();
51+
}
52+
// If the pattern matches a literal string, optimize to an equals call.
53+
if (LITERAL_PATTERN_WITH_DOT_UNESCAPED.matcher(pattern).matches()) {
54+
return unicodeToInternal(pattern.replace("\\.", "."))::equals;
4755
}
4856
// Recognize a pattern that starts with ".*" and drop it so that the engine sees the next part
4957
// of the pattern as the beginning and potentially optimizes the literal search.
5058
// We don't apply the same transformation to the end of the pattern since it is harder to
5159
// determine if a ".*" suffix is safe to drop due to backslash escapes.
5260
Matcher suffixMatch = EXTRACT_SUFFIX_MATCH.matcher(pattern);
5361
if (!suffixMatch.matches()) {
54-
return s -> regexPattern.matcher(s).matches();
62+
return s -> regexPattern.matcher(internalToUnicode(s)).matches();
5563
}
5664
String suffixPattern = suffixMatch.group("suffix");
5765
// A null suffixPattern implies the regex is equivalent to ".*", which matches any input string.
@@ -61,14 +69,14 @@ public static Predicate<String> asOptimizedMatchingPredicate(Pattern regexPatter
6169
// If the pattern matches a literal suffix, optimize to a string suffix
6270
// match, which is by far the fastest way to match.
6371
if (LITERAL_PATTERN_WITH_DOT_UNESCAPED.matcher(suffixPattern).matches()) {
64-
String literalSuffix = suffixPattern.replace("\\.", ".");
72+
String literalSuffix = unicodeToInternal(suffixPattern.replace("\\.", "."));
6573
return s -> s.endsWith(literalSuffix);
6674
}
6775
// Turn the "match" pattern into an equivalent "find" pattern, since these are the only ones
6876
// that benefit from the Boyer-Moore optimization in the Java regex engine.
6977
// https://github.com/openjdk/jdk/blob/50dced88ff1aed23bb4c8fe9e4a08e6cc200b897/src/java.base/share/classes/java/util/regex/Pattern.java#L1959-L1969
7078
Pattern compiled = Pattern.compile(suffixPattern + "\\z", Pattern.DOTALL);
71-
return s -> compiled.matcher(s).find();
79+
return s -> compiled.matcher(internalToUnicode(s)).find();
7280
}
7381

7482
private RegexUtil() {}

src/main/java/com/google/devtools/common/options/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ java_library(
5050
),
5151
deps = [
5252
"//src/main/java/com/google/devtools/build/lib/util:pair",
53+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
5354
"//src/main/java/com/google/devtools/build/lib/util/regex:regex_util",
5455
"//src/main/java/net/starlark/java/spelling",
5556
"//third_party:auto_value",

src/main/java/com/google/devtools/common/options/Converters.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import com.google.common.collect.ImmutableMap;
2323
import com.google.common.collect.ImmutableSet;
2424
import com.google.common.collect.Maps;
25+
import com.google.devtools.build.lib.util.StringEncoding;
2526
import java.time.Duration;
2627
import java.util.Iterator;
2728
import java.util.List;
@@ -422,7 +423,8 @@ public static class RegexPatternConverter extends Converter.Contextless<RegexPat
422423
@Override
423424
public RegexPatternOption convert(String input) throws OptionsParsingException {
424425
try {
425-
return RegexPatternOption.create(Pattern.compile(input, Pattern.DOTALL));
426+
return RegexPatternOption.create(
427+
Pattern.compile(StringEncoding.internalToUnicode(input), Pattern.DOTALL));
426428
} catch (PatternSyntaxException e) {
427429
throw new OptionsParsingException("Not a valid regular expression: " + e.getMessage());
428430
}

src/main/java/com/google/devtools/common/options/RegexPatternOption.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ static RegexPatternOption create(Pattern regexPattern) {
3636
RegexUtil.asOptimizedMatchingPredicate(regexPattern));
3737
}
3838

39-
/** The original regex pattern. */
39+
/**
40+
* The original regex pattern.
41+
*
42+
* <p>Note: Strings passed to the {@link Pattern} and {@link java.util.regex.Matcher} API have to
43+
* be converted to "Unicode" form first (see {@link
44+
* com.google.devtools.build.lib.util.StringEncoding#internalToUnicode}.
45+
*/
4046
public abstract Pattern regexPattern();
4147

4248
/**

src/test/java/com/google/devtools/build/lib/util/regex/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ java_binary(
3838
)
3939

4040
# Start fuzzing via
41-
# bazel run //src/test/java/com/google/devtools/build/lib/util:RegexUtilFuzzTest_run
41+
# bazel run //src/test/java/com/google/devtools/build/lib/util/regex:RegexUtilFuzzTest_run
4242
java_fuzz_test(
4343
name = "RegexUtilFuzzTest",
4444
srcs = ["RegexUtilFuzzTest.java"],
@@ -53,6 +53,7 @@ java_fuzz_test(
5353
"notap",
5454
],
5555
deps = [
56+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
5657
"//src/main/java/com/google/devtools/build/lib/util/regex:regex_util",
5758
"//third_party:jmh",
5859
],

src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilFuzzTest.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
// limitations under the License.
1414
package com.google.devtools.build.lib.util.regex;
1515

16+
import static com.google.devtools.build.lib.util.StringEncoding.unicodeToInternal;
17+
1618
import com.code_intelligence.jazzer.api.FuzzedDataProvider;
1719
import java.util.function.Predicate;
1820
import java.util.regex.Pattern;
@@ -35,7 +37,9 @@ public static void fuzzerTestOneInput(FuzzedDataProvider data) {
3537
}
3638

3739
Predicate<String> optimizedMatcher = RegexUtil.asOptimizedMatchingPredicate(originalPattern);
38-
if (optimizedMatcher.test(haystack) != originalPattern.matcher(haystack).matches()) {
40+
boolean originalMatches = originalPattern.matcher(haystack).matches();
41+
boolean optimizedMatches = optimizedMatcher.test(unicodeToInternal(haystack));
42+
if (originalMatches != optimizedMatches) {
3943
throw new AssertionError(
4044
"""
4145
Optimized matcher and original matcher differ in behavior:
@@ -44,11 +48,7 @@ public static void fuzzerTestOneInput(FuzzedDataProvider data) {
4448
originalPattern.matcher(haystack).matches(): %s
4549
optimizedMatcher.test(haystack): %s
4650
"""
47-
.formatted(
48-
needle,
49-
haystack,
50-
originalPattern.matcher(haystack).matches(),
51-
optimizedMatcher.test(haystack)));
51+
.formatted(needle, haystack, originalMatches, optimizedMatches));
5252
}
5353
}
5454
}

src/test/java/com/google/devtools/build/lib/util/regex/RegexUtilTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ public void optimizedMatchingPredicate(
3434
"a",
3535
"foo",
3636
"foofoo",
37+
"coverage.dat",
3738
"/coverage.dat",
3839
"/coverage.data",
3940
"/coverage1dat",
@@ -55,6 +56,7 @@ public void optimizedMatchingPredicate(
5556
".*?foo",
5657
".*+foo",
5758
"^foo$",
59+
"coverage\\.dat",
5860
".*/coverage.dat",
5961
".*/coverage\\.dat",
6062
".*/test/.*/coverage\\.dat",

src/test/java/com/google/devtools/common/options/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ java_test(
4343
deps = [
4444
":testutils",
4545
"//src/main/java/com/google/devtools/build/lib/util:classpath",
46+
"//src/main/java/com/google/devtools/build/lib/util:string_encoding",
4647
"//src/main/java/com/google/devtools/common/options",
4748
"//src/main/java/com/google/devtools/common/options:invocation_policy",
4849
"//src/main/java/com/google/devtools/common/options/testing",

src/test/java/com/google/devtools/common/options/RegexPatternConverterTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import static com.google.common.truth.Truth.assertThat;
1717
import static org.junit.Assert.assertThrows;
1818

19+
import com.google.devtools.build.lib.util.StringEncoding;
1920
import com.google.devtools.common.options.Converters.RegexPatternConverter;
2021
import com.google.devtools.common.options.testing.ConverterTester;
2122
import com.google.testing.junit.testparameterinjector.TestParameterInjector;
@@ -61,4 +62,39 @@ public void throwsForWrongPattern() {
6162
assertThrows(OptionsParsingException.class, () -> new RegexPatternConverter().convert("{"));
6263
assertThat(e).hasMessageThat().startsWith("Not a valid regular expression:");
6364
}
65+
66+
@Test
67+
public void unicodeLiteral() throws OptionsParsingException {
68+
// Options passed on the command line are passed to convertes in the internal encoding.
69+
var regex = new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱"));
70+
assertThat(regex.regexPattern().matcher("äöüÄÖÜß🌱").matches()).isTrue();
71+
assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱"))).isTrue();
72+
}
73+
74+
@Test
75+
public void unicodeLiteral_caseInsensitive() throws OptionsParsingException {
76+
// Options passed on the command line are passed to convertes in the internal encoding.
77+
var regex =
78+
new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("(?ui)äöüÄÖÜß🌱"));
79+
assertThat(regex.regexPattern().matcher("ÄÖÜäöüß🌱").matches()).isTrue();
80+
assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("ÄÖÜäöüß🌱"))).isTrue();
81+
}
82+
83+
@Test
84+
public void unicodeLiteral_suffix() throws OptionsParsingException {
85+
// Options passed on the command line are passed to convertes in the internal encoding.
86+
var regex =
87+
new RegexPatternConverter().convert(StringEncoding.unicodeToInternal(".*äöüÄÖÜß🌱"));
88+
assertThat(regex.regexPattern().matcher("äöüäöüÄÖÜß🌱").matches()).isTrue();
89+
assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüäöüÄÖÜß🌱"))).isTrue();
90+
}
91+
92+
@Test
93+
public void unicodeClass() throws OptionsParsingException {
94+
// Options passed on the command line are passed to convertes in the internal encoding.
95+
var regex =
96+
new RegexPatternConverter().convert(StringEncoding.unicodeToInternal("\\p{L}{7}\\p{IsEmoji}"));
97+
assertThat(regex.regexPattern().matcher("äöüÄÖÜß🌱").matches()).isTrue();
98+
assertThat(regex.matcher().test(StringEncoding.unicodeToInternal("äöüÄÖÜß🌱"))).isTrue();
99+
}
64100
}

0 commit comments

Comments
 (0)