dotnet-installer/src/SourceBuild/content/test/Microsoft.DotNet.SourceBuild.SmokeTests/LicenseScanTests.cs
2024-07-16 08:14:13 -06:00

296 lines
16 KiB
C#

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
using Xunit;
using Xunit.Abstractions;
namespace Microsoft.DotNet.SourceBuild.SmokeTests;
/// <summary>
/// Scans the VMR for licenses and compares them to a baseline. This ensures that only open-source licenses are used for relevant files.
/// </summary>
/// <remarks>
/// Each sub-repo of the VMR is scanned separately because of the amount of time it takes.
/// When scanning is run, the test provides a list of files for the scanner to ignore. These include binary file types. It also includes
/// .il/.ildump file types which are massive, causing the scanner to choke and don't include license references anyway.
/// Once the scanner returns the results, a filtering process occurs. First, any detected license that is in the allowed list of licenses
/// is filtered out. The test defines a list of such licenses that all represent open-source licenses. Next, a license exclusions file is
/// applied to the filtering. This file contains a set of paths for which certain detected licenses are to be ignored. Such a path can be
/// defined to ignore all detected licenses or specific ones. These exclusions are useful for ignoring false positives where the scanning
/// tool has detected something in the file that makes it think it's a license reference when that's not actually the intent. Other cases
/// that are excluded are when the license is meant as configuration or test data and not actually applying to the code. These exclusions
/// further filter down the set of the detected licenses for each file. Everything that's left at this point is reported. It gets compared
/// to a baseline file (which is defined for each sub-repo). If the filtered results differ from what's defined in the baseline, the test fails.
///
/// Rules for determining how to resolve a detected license:
/// 1. If it's an allowed open-source license, add it to the list of allowed licenses in LicenseScanTests.cs.
/// 2. If the file shouldn't be scanned as a general rule because of its file type (e.g. image file), add it to the list of excluded file types in LicenseScanTests.cs.
/// 3. Add it to LicenseExclusions.txt if the referenced license is one of the following:
/// a. Not applicable (e.g. test data)
/// b. False positive
/// 4. If the license is not allowed for open-souce, the license needs to be fixed. Everything else should go in the baseline file.
/// </remarks>
public class LicenseScanTests : TestBase
{
private const string BaselineSubDir = "licenses";
private static readonly string[] s_allowedLicenseExpressions = new string[]
{
"apache-1.1", // https://opensource.org/license/apache-1-1/
"apache-2.0", // https://opensource.org/license/apache-2-0/
"apache-2.0 WITH apple-runtime-library-exception", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/apple-runtime-library-exception.LICENSE
"apache-2.0 WITH llvm-exception", // https://foundation.llvm.org/relicensing/LICENSE.txt
"apsl-2.0", // https://opensource.org/license/apsl-2-0-php/
"boost-1.0", // https://opensource.org/license/bsl-1-0/
"bsd-new", // https://opensource.org/license/BSD-3-clause/
"bsd-original", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/bsd-original.LICENSE
"bsd-original-uc", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/bsd-original-uc.LICENSE
"bsd-simplified", // https://opensource.org/license/bsd-2-clause/
"bytemark", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/bytemark.LICENSE
"bzip2-libbzip-2010", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/bzip2-libbzip-2010.LICENSE
"cc0-1.0", // https://creativecommons.org/publicdomain/zero/1.0/legalcode
"cc-by-3.0", // https://creativecommons.org/licenses/by/3.0/legalcode
"cc-by-sa-3.0", // https://creativecommons.org/licenses/by-sa/3.0/legalcode
"cc-by-sa-4.0", // https://creativecommons.org/licenses/by-sa/4.0/legalcode
"cc-pd", // https://creativecommons.org/publicdomain/mark/1.0/
"cc-sa-1.0", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/cc-sa-1.0.LICENSE
"epl-1.0", // https://opensource.org/license/epl-1-0/
"generic-cla", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/generic-cla.LICENSE
"gpl-1.0-plus", // https://opensource.org/license/gpl-1-0/
"gpl-2.0", // https://opensource.org/license/gpl-2-0/
"ietf", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/ietf.LICENSE
"gpl-2.0-plus WITH autoconf-simple-exception-2.0", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/rules/gpl-2.0-plus_with_autoconf-simple-exception-2.0_8.RULE
"gpl-2.0 WITH gcc-linking-exception-2.0", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/rules/gpl-2.0_with_gcc-linking-exception-2.0_6.RULE
"isc", // https://opensource.org/license/isc-license-txt/
"iso-8879", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/iso-8879.LICENSE
"lgpl-2.0-plus", // https://opensource.org/license/lgpl-2-0/
"lgpl-2.1", // https://opensource.org/license/lgpl-2-1/
"lgpl-2.1-plus", // https://opensource.org/license/lgpl-2-1/
"lzma-sdk-9.22", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/lzma-sdk-9.22.LICENSE
"mit", // https://opensource.org/license/mit/
"mit-addition", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/mit-addition.LICENSE
"mit-testregex", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/mit-testregex.LICENSE
"ms-patent-promise", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/ms-patent-promise.LICENSE
"ms-lpl", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/ms-lpl.LICENSE
"ms-pl", // https://opensource.org/license/ms-pl-html/
"ms-rl", // https://opensource.org/license/ms-rl-html/
"newton-king-cla", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/newton-king-cla.LICENSE
"ngpl", // https://opensource.org/license/nethack-php/
"object-form-exception-to-mit", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/object-form-exception-to-mit.LICENSE
"ofl-1.1", // https://opensource.org/license/ofl-1-1/
"osf-1990", // https://fedoraproject.org/wiki/Licensing:MIT?rd=Licensing/MIT#HP_Variant
"pcre2-exception", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/pcre2-exception.LICENSE
"public-domain", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/public-domain.LICENSE
"public-domain-disclaimer", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/public-domain-disclaimer.LICENSE
"python", // https://opensource.org/license/python-2-0/
"rpl-1.5", // https://opensource.org/license/rpl-1-5/
"sax-pd", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/sax-pd.LICENSE
"unicode", // https://opensource.org/license/unicode-inc-license-agreement-data-files-and-software/
"unicode-mappings", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/unicode-mappings.LICENSE
"uoi-ncsa", // https://opensource.org/license/uoi-ncsa-php/
"w3c-software-19980720", // https://opensource.org/license/w3c/
"w3c-software-doc-20150513", // https://opensource.org/license/w3c/
"warranty-disclaimer", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/warranty-disclaimer.LICENSE
"x11", // https://github.com/nexB/scancode-toolkit/blob/develop/src/licensedcode/data/licenses/x11.LICENSE
"zlib" // https://opensource.org/license/zlib/
};
private static readonly string[] s_ignoredFilePatterns = new string[]
{
"*.bin",
"*.bmp",
"*.bson",
"*.db",
"*.dic",
"*.eot",
"*.gif",
"*.ico",
"*.jpg",
"*.il",
"*.ildump",
"*.lss",
"*.nlp",
"*.otf",
"*.pdf",
"*.pfx",
"*.png",
"*.snk",
"*.ttf",
"*.vsd",
"*.vsdx",
"*.winmd",
"*.woff",
"*.woff2",
"*.xlsx",
};
private readonly string _targetRepo;
public LicenseScanTests(ITestOutputHelper outputHelper) : base(outputHelper)
{
Assert.NotNull(Config.LicenseScanPath);
_targetRepo = new DirectoryInfo(Config.LicenseScanPath).Name;
}
[SkippableFact(Config.LicenseScanPathEnv, skipOnNullOrWhiteSpaceEnv: true)]
public void ScanForLicenses()
{
Assert.NotNull(Config.LicenseScanPath);
// Indicates how long until a timeout occurs for scanning a given file
const int FileScanTimeoutSeconds = 240;
string scancodeResultsPath = Path.Combine(LogsDirectory, "scancode-results.json");
// Scancode Doc: https://scancode-toolkit.readthedocs.io/en/latest/index.html
string ignoreOptions = string.Join(" ", s_ignoredFilePatterns.Select(pattern => $"--ignore {pattern}"));
ExecuteHelper.ExecuteProcessValidateExitCode(
"scancode",
$"--license --processes 4 --timeout {FileScanTimeoutSeconds} --strip-root --only-findings {ignoreOptions} --json-pp {scancodeResultsPath} {Config.LicenseScanPath}",
OutputHelper);
JsonDocument doc = JsonDocument.Parse(File.ReadAllText(scancodeResultsPath));
ScancodeResults? scancodeResults = doc.Deserialize<ScancodeResults>();
Assert.NotNull(scancodeResults);
FilterFiles(scancodeResults);
JsonSerializerOptions options = new()
{
WriteIndented = true
};
string json = JsonSerializer.Serialize(scancodeResults, options);
string baselineName = $"Licenses.{_targetRepo}.json";
string baselinePath = BaselineHelper.GetBaselineFilePath(baselineName, BaselineSubDir);
if (!File.Exists(baselinePath))
{
Assert.Fail($"No license baseline file exists for repo '{_targetRepo}'. Expected file: {baselinePath}");
}
BaselineHelper.CompareBaselineContents(baselineName, json, OutputHelper, Config.WarnOnLicenseScanDiffs, BaselineSubDir);
}
private LicenseExclusion ParseLicenseExclusion(string rawExclusion)
{
string[] parts = rawExclusion.Split('|', StringSplitOptions.RemoveEmptyEntries);
Match repoNameMatch = Regex.Match(parts[0], @"(?<=src/)[^/]+");
Assert.True(repoNameMatch.Success);
// The path in the exclusion file is rooted from the VMR. But the path in the scancode results is rooted from the
// target repo within the VMR. So we need to strip off the beginning part of the path.
Match restOfPathMatch = Regex.Match(parts[0], @"(?<=src/[^/]+/).*");
string path = restOfPathMatch.Value;
if (parts.Length == 0 || parts.Length > 2)
{
throw new Exception($"Invalid license exclusion: '{rawExclusion}'");
}
if (parts.Length > 1)
{
string[] licenseExpressions = parts[1].Split(',', StringSplitOptions.RemoveEmptyEntries);
return new LicenseExclusion(repoNameMatch.Value, path, licenseExpressions);
}
else
{
return new LicenseExclusion(repoNameMatch.Value, path, Enumerable.Empty<string>());
}
}
private void FilterFiles(ScancodeResults scancodeResults)
{
IEnumerable<string> rawExclusions = Utilities.ParseExclusionsFile("LicenseExclusions.txt");
IEnumerable<LicenseExclusion> exclusions = rawExclusions
.Select(exclusion => ParseLicenseExclusion(exclusion))
.Where(exclusion => exclusion.Repo == _targetRepo)
.ToList();
// This will filter out files that we don't want to include in the baseline.
// Filtering can happen in two ways:
// 1. There are a set of allowed license expressions that apply to all files. If a file has a match on one of those licenses,
// that license will not be considered.
// 2. The LicenseExclusions.txt file contains a list of files and the licenses that should be excluded from those files.
// Once the license expression filtering has been applied, if a file has any licenses left, it will be included in the baseline.
// In that case, the baseline will list all of the licenses for that file, even if some were originally excluded during this processing.
// In other words, the baseline will be fully representative of the licenses that apply to the files that are listed there.
for (int i = scancodeResults.Files.Count - 1; i >= 0; i--)
{
ScancodeFileResult file = scancodeResults.Files[i];
// A license expression can be a logical expression, e.g. "(MIT OR Apache-2.0)"
// For our purposes, we just care about the license involved, not the semantics of the expression.
// Parse out all the expression syntax to just get the license names.
string[] licenses = file.LicenseExpression?
.Replace("(", string.Empty)
.Replace(")", string.Empty)
.Replace(" AND ", ",")
.Replace(" OR ", ",")
.Split(",", StringSplitOptions.RemoveEmptyEntries)
.Select(license => license.Trim())
.ToArray()
?? Array.Empty<string>();
// First check whether the file's licenses can all be matched with allowed expressions
IEnumerable<string> disallowedLicenses = licenses
.Where(license => !s_allowedLicenseExpressions.Contains(license, StringComparer.OrdinalIgnoreCase));
if (!disallowedLicenses.Any())
{
scancodeResults.Files.Remove(file);
}
else
{
// There are some licenses that are not allowed. Now check whether the file is excluded.
IEnumerable<LicenseExclusion> matchingExclusions =
Utilities.GetMatchingFileExclusions(file.Path, exclusions, exclusion => exclusion.Path);
IEnumerable<string> excludedLicenses = matchingExclusions.SelectMany(exclusion => exclusion.LicenseExpressions);
// If no licenses are explicitly specified, it means they're all excluded.
if (matchingExclusions.Any() && !excludedLicenses.Any())
{
scancodeResults.Files.Remove(file);
}
else
{
IEnumerable<string> remainingLicenses = disallowedLicenses.Except(excludedLicenses);
if (!remainingLicenses.Any())
{
scancodeResults.Files.Remove(file);
}
}
}
}
}
private record LicenseExclusion(string Repo, string Path, IEnumerable<string> LicenseExpressions);
private class ScancodeResults
{
[JsonPropertyName("files")]
public List<ScancodeFileResult> Files { get; set; } = new();
}
private class ScancodeFileResult
{
[JsonPropertyName("path")]
public string Path { get; set; } = string.Empty;
[JsonPropertyName("detected_license_expression")]
public string? LicenseExpression { get; set; }
}
}