From b36db5ad111e0f60c6032ddb7944cfd0a5424ba4 Mon Sep 17 00:00:00 2001 From: Tamas Vajk Date: Wed, 3 Jul 2024 08:09:30 +0200 Subject: [PATCH 1/2] C#: Fix glob pattern processing: allow `**/` to match empty string --- .../Semmle.Extraction.Tests/FilePathFilter.cs | 41 ++++++++++++++++--- .../Semmle.Extraction.Tests/FilePattern.cs | 6 ++- .../Semmle.Extraction/FilePattern.cs | 16 +++++++- 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/csharp/extractor/Semmle.Extraction.Tests/FilePathFilter.cs b/csharp/extractor/Semmle.Extraction.Tests/FilePathFilter.cs index efb117a263d0..ae0a371706a3 100644 --- a/csharp/extractor/Semmle.Extraction.Tests/FilePathFilter.cs +++ b/csharp/extractor/Semmle.Extraction.Tests/FilePathFilter.cs @@ -216,9 +216,9 @@ public void TestFiltersWithIncludeExcludeComplexPatterns1() var expectedRegexMessages = new[] { - "Filtering in files matching '^c/.*/i\\.[^/]*.*'. Original glob filter: 'include:c/**/i.*'", - "Filtering in files matching '^c/d/.*/[^/]*\\.cs.*'. Original glob filter: 'include:c/d/**/*.cs'", - "Filtering out files matching '^.*/z/i\\.cs.*'. Original glob filter: 'exclude:**/z/i.cs'" + "Filtering in files matching '^c/(.*/|)i\\.[^/]*.*'. Original glob filter: 'include:c/**/i.*'", + "Filtering in files matching '^c/d/(.*/|)[^/]*\\.cs.*'. Original glob filter: 'include:c/d/**/*.cs'", + "Filtering out files matching '^(.*/|)z/i\\.cs.*'. Original glob filter: 'exclude:**/z/i.cs'" }; Assert.Equivalent(expectedRegexMessages, logger.Messages, strict: false); } @@ -244,10 +244,41 @@ public void TestFiltersWithIncludeExcludeComplexPatterns2() var expectedRegexMessages = new[] { - "Filtering in files matching '^.*/i\\.[^/]*.*'. Original glob filter: 'include:**/i.*'", - "Filtering out files matching '^.*/z/i\\.cs.*'. Original glob filter: 'exclude:**/z/i.cs'" + "Filtering in files matching '^(.*/|)i\\.[^/]*.*'. Original glob filter: 'include:**/i.*'", + "Filtering out files matching '^(.*/|)z/i\\.cs.*'. Original glob filter: 'exclude:**/z/i.cs'" }; Assert.Equivalent(expectedRegexMessages, logger.Messages, strict: false); } + + [Fact] + public void TestFiltersWithIncludeExcludeComplexPatternsRelativeRoot() + { + (var testSubject, var logger, var files) = TestSetup(); + + // 'c' is the start of the relative path so we want to ensure the `**/` glob can match start + Environment.SetEnvironmentVariable("LGTM_INDEX_FILTERS", """ + include:**/c/**/i.* + exclude:**/c/**/z/i.cs + exclude:**/**/c/**/z/i.cs + """); + + var filtered = testSubject.Filter(files); + + var expected = GetExpected( + [ + "/a/b/c/x/y/i.cs", + ]); + + AssertFileInfoEquivalence(expected, filtered); + var expectedRegexMessages = new[] + { + "Filtering in files matching '^(.*/|)c/(.*/|)i\\.[^/]*.*'. Original glob filter: 'include:**/c/**/i.*'", + "Filtering out files matching '^(.*/|)c/(.*/|)z/i\\.cs.*'. Original glob filter: 'exclude:**/c/**/z/i.cs'", + "Filtering out files matching '^(.*/|)(.*/|)c/(.*/|)z/i\\.cs.*'. Original glob filter: 'exclude:**/**/c/**/z/i.cs'" + }; + + + Assert.Equivalent(expectedRegexMessages, logger.Messages, strict: false); + } } } diff --git a/csharp/extractor/Semmle.Extraction.Tests/FilePattern.cs b/csharp/extractor/Semmle.Extraction.Tests/FilePattern.cs index a4b2214b5e86..9985a6f06c68 100644 --- a/csharp/extractor/Semmle.Extraction.Tests/FilePattern.cs +++ b/csharp/extractor/Semmle.Extraction.Tests/FilePattern.cs @@ -10,13 +10,15 @@ public void TestRegexCompilation() var fp = new FilePattern("/hadoop*"); Assert.Equal("^hadoop[^/]*.*", fp.RegexPattern); fp = new FilePattern("**/org/apache/hadoop"); - Assert.Equal("^.*/org/apache/hadoop.*", fp.RegexPattern); + Assert.Equal("^(.*/|)org/apache/hadoop.*", fp.RegexPattern); fp = new FilePattern("hadoop-common/**/test// "); - Assert.Equal("^hadoop-common/.*/test(?/).*", fp.RegexPattern); + Assert.Equal("^hadoop-common/(.*/|)test(?/).*", fp.RegexPattern); fp = new FilePattern(@"-C:\agent\root\asdf//"); Assert.Equal("^C:/agent/root/asdf(?/).*", fp.RegexPattern); fp = new FilePattern(@"-C:\agent+\[root]\asdf//"); Assert.Equal(@"^C:/agent\+/\[root]/asdf(?/).*", fp.RegexPattern); + fp = new FilePattern(@"**/**/abc/**/def/**"); + Assert.Equal(@"^(.*/|)(.*/|)abc/(.*/|)def/.*", fp.RegexPattern); } [Fact] diff --git a/csharp/extractor/Semmle.Extraction/FilePattern.cs b/csharp/extractor/Semmle.Extraction/FilePattern.cs index 01815582aea9..8ab9fcd142cd 100644 --- a/csharp/extractor/Semmle.Extraction/FilePattern.cs +++ b/csharp/extractor/Semmle.Extraction/FilePattern.cs @@ -76,8 +76,20 @@ bool HasCharAt(int i, Predicate p) => throw new InvalidFilePatternException(pattern, "'**' preceeded by non-`/` character."); if (HasCharAt(i + 2, c => c != '/')) throw new InvalidFilePatternException(pattern, "'**' succeeded by non-`/` character"); - sb.Append(".*"); - i += 2; + + if (i + 2 < pattern.Length) + { + // Processing .../**/... + sb.Append("(.*/|)"); + i += 3; + } + else + { + // Processing .../** + sb.Append(".*"); + // There's no need to add another .* to the end outside the loop, we can return early. + return sb; + } } else { From 6a036f4e84b84f64396c949c135c08000bb3646d Mon Sep 17 00:00:00 2001 From: Tamas Vajk Date: Wed, 3 Jul 2024 12:45:47 +0200 Subject: [PATCH 2/2] Improve code quality --- csharp/extractor/Semmle.Extraction/FilePattern.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/extractor/Semmle.Extraction/FilePattern.cs b/csharp/extractor/Semmle.Extraction/FilePattern.cs index 8ab9fcd142cd..8c8e190a3ced 100644 --- a/csharp/extractor/Semmle.Extraction/FilePattern.cs +++ b/csharp/extractor/Semmle.Extraction/FilePattern.cs @@ -80,15 +80,15 @@ bool HasCharAt(int i, Predicate p) => if (i + 2 < pattern.Length) { // Processing .../**/... + // ^^^ sb.Append("(.*/|)"); i += 3; } else { - // Processing .../** - sb.Append(".*"); - // There's no need to add another .* to the end outside the loop, we can return early. - return sb; + // Processing .../** at the end of the pattern. + // There's no need to add .* because it's anyways added outside the loop. + break; } } else