Rollup merge of #154823 - jakubadamw:spdx-rs-replacement, r=Mark-Simulacrum Replace the spdx-rs dependency with a minimal in-tree SPDX tag-value parser The spdx-rs crate [is no longer maintained](https://github.com/doubleopen-project/spdx-rs/pulls) and is behind on its own dependency updates. It is currently used in [the collect-license-metadata tool](https://github.com/rust-lang/rust/tree/main/src/tools/collect-license-metadata), which calls a single function from it: `spdx_rs::parsers::spdx_from_tag_value`, to parse the output of the `reuse` tool and extract file names, licences and copyright text. This PR replaces the use of that function with a minimal parser that handles just the subset of the SPDX tag-value format that is needed: `Tag: Value` lines and multi-line `<text>...</text>` blocks. As a side effect, this gets rid of the last transitive dependency on syn v1.
diff --git a/Cargo.lock b/Cargo.lock index 3b56dd6..5a9764b 100644 --- a/Cargo.lock +++ b/Cargo.lock
@@ -199,7 +199,7 @@ "rustc-hash 2.1.1", "serde", "serde_derive", - "syn 2.0.110", + "syn", ] [[package]] @@ -396,7 +396,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -573,7 +573,6 @@ dependencies = [ "iana-time-zone", "num-traits", - "serde", "windows-link 0.2.1", ] @@ -635,10 +634,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -768,7 +767,6 @@ "serde", "serde_json", "similar", - "spdx-rs", ] [[package]] @@ -804,7 +802,7 @@ "nom", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1047,7 +1045,7 @@ "proc-macro2", "quote", "scratch", - "syn 2.0.110", + "syn", ] [[package]] @@ -1061,7 +1059,7 @@ "indexmap", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1079,7 +1077,7 @@ "indexmap", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1103,7 +1101,7 @@ "proc-macro2", "quote", "strsim", - "syn 2.0.110", + "syn", ] [[package]] @@ -1114,7 +1112,7 @@ dependencies = [ "darling_core", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1146,7 +1144,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1167,7 +1165,7 @@ "darling", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1177,7 +1175,7 @@ checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.110", + "syn", ] [[package]] @@ -1189,7 +1187,7 @@ "darling", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1264,7 +1262,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -1688,12 +1686,6 @@ [[package]] name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" @@ -2092,7 +2084,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -2389,7 +2381,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -2534,7 +2526,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -2915,7 +2907,7 @@ "pest_meta", "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -3125,7 +3117,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -3316,7 +3308,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -3409,7 +3401,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -4125,7 +4117,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -4271,7 +4263,7 @@ "fluent-syntax", "proc-macro2", "quote", - "syn 2.0.110", + "syn", "synstructure", ] @@ -4852,7 +4844,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", "synstructure", ] @@ -4952,7 +4944,7 @@ "proc-macro2", "quote", "serde", - "syn 2.0.110", + "syn", ] [[package]] @@ -5069,7 +5061,7 @@ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.110", + "syn", ] [[package]] @@ -5155,7 +5147,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -5166,7 +5158,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -5300,35 +5292,6 @@ ] [[package]] -name = "spdx-expression" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d7ac03c67c572d85049d6db815e20a4a19b41b3d5cca732ac582342021ad77" -dependencies = [ - "nom", - "serde", - "thiserror 1.0.69", - 
"tracing", -] - -[[package]] -name = "spdx-rs" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990870190ec8d8c64ba66e4a6746243d6e57d99353991e0e6092334833f429b1" -dependencies = [ - "chrono", - "log", - "nom", - "serde", - "spdx-expression", - "strum", - "strum_macros", - "thiserror 1.0.69", - "uuid", -] - -[[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5394,36 +5357,6 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] -name = "strum" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" - -[[package]] -name = "strum_macros" -version = "0.24.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 1.0.109", -] - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] name = "syn" version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5442,7 +5375,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -5579,7 +5512,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -5590,7 +5523,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -5832,7 +5765,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -6028,7 +5961,7 @@ dependencies = [ "proc-macro-hack", "quote", - "syn 2.0.110", + "syn", "unic-langid-impl", ] @@ -6266,7 +6199,7 
@@ "bumpalo", "proc-macro2", "quote", - "syn 2.0.110", + "syn", "wasm-bindgen-shared", ] @@ -6523,7 +6456,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -6534,7 +6467,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -6904,7 +6837,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", "synstructure", ] @@ -6925,7 +6858,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]] @@ -6945,7 +6878,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", "synstructure", ] @@ -6980,7 +6913,7 @@ dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn", ] [[package]]
diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml index f84da24..ae41d2f 100644 --- a/src/tools/collect-license-metadata/Cargo.toml +++ b/src/tools/collect-license-metadata/Cargo.toml
@@ -10,4 +10,3 @@ serde = { version = "1.0.147", features = ["derive"] } serde_json = "1.0.85" similar = "2.7.0" -spdx-rs = "0.5.1"
diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs index 4e218ea..156871b 100644 --- a/src/tools/collect-license-metadata/src/main.rs +++ b/src/tools/collect-license-metadata/src/main.rs
@@ -1,6 +1,7 @@ mod licenses; mod path_tree; mod reuse; +mod spdx; use std::path::PathBuf;
diff --git a/src/tools/collect-license-metadata/src/reuse.rs b/src/tools/collect-license-metadata/src/reuse.rs index dbe4678..6bc4145 100644 --- a/src/tools/collect-license-metadata/src/reuse.rs +++ b/src/tools/collect-license-metadata/src/reuse.rs
@@ -15,18 +15,15 @@ pub(crate) fn collect( let raw = &obtain_spdx_document(reuse_exe)?; println!("finished gathering the license information from REUSE in {:.2?}", start.elapsed()); - let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?; + let files = crate::spdx::parse_tag_value(raw)?; let mut result = Vec::new(); - for file in document.file_information { - let concluded_license = file.concluded_license.expect("File should have licence info"); - let copyright_text = file.copyright_text.expect("File should have copyright text"); + for file in files { let license = interner.intern(License { - spdx: concluded_license.to_string(), - copyright: copyright_text.split('\n').map(|s| s.into()).collect(), + spdx: file.concluded_license, + copyright: file.copyright_text.split('\n').map(|s| s.into()).collect(), }); - - result.push((file.file_name.into(), license)); + result.push((file.name.into(), license)); } Ok(result)
diff --git a/src/tools/collect-license-metadata/src/spdx/mod.rs b/src/tools/collect-license-metadata/src/spdx/mod.rs new file mode 100644 index 0000000..a94f2bc --- /dev/null +++ b/src/tools/collect-license-metadata/src/spdx/mod.rs
@@ -0,0 +1,102 @@ +use anyhow::Error; + +/// A single file entry extracted from an SPDX tag-value document. +pub(crate) struct SpdxFileEntry { + pub(crate) name: String, + pub(crate) concluded_license: String, + pub(crate) copyright_text: String, +} + +/// Parses an SPDX tag-value document and extracts file information. +/// +/// This is a minimal parser that only extracts the fields we need (FileName, LicenseConcluded, +/// FileCopyrightText) rather than the full model. Per the SPDX specification, each line is a +/// `Tag: Value` pair and multi-line values are wrapped in `<text>…</text>`; lines without a +/// `: ` separator and all other tags are skipped. Returns an error if a file section lacks +/// `LicenseConcluded` or `FileCopyrightText`, or if a `<text>` block is never closed. +pub(crate) fn parse_tag_value(input: &str) -> Result<Vec<SpdxFileEntry>, Error> { + let mut files = Vec::new(); + let mut current_name: Option<String> = None; + let mut current_license: Option<String> = None; + let mut current_copyright: Option<String> = None; + + let mut lines = input.lines(); + while let Some(line) = lines.next() { + let Some((tag, value)) = line.split_once(": ") else { + continue; + }; + + let value = resolve_multiline_value(value, &mut lines)?; + + match tag { + "FileName" => { + // New file section: flush the previous entry (if any) and reset per-file state. + if let Some(name) = current_name.take() { + files.push(build_file_entry( + name, + current_license.take(), + current_copyright.take(), + )?); + } + current_name = Some(value); + current_license = None; + current_copyright = None; + } + "LicenseConcluded" => current_license = Some(value), + "FileCopyrightText" => current_copyright = Some(value), + _ => {} + } + } + + // Flush the last file section; no later `FileName` tag exists to trigger it. + if let Some(name) = current_name { + files.push(build_file_entry(name, current_license, current_copyright)?); + } + + Ok(files) +} + +/// Resolves a tag value that might span multiple lines using `<text>…</text>`.
+fn resolve_multiline_value<'a>( + value: &str, + further_lines: &mut impl Iterator<Item = &'a str>, +) -> Result<String, Error> { + let Some(start) = value.strip_prefix("<text>") else { + return Ok(value.to_string()); + }; + + // The closing tag might be on the same line; then no further lines are consumed. + if let Some(content) = start.strip_suffix("</text>") { + return Ok(content.to_string()); + } + + let mut text = start.to_string(); + for line in further_lines.by_ref() { + if let Some(rest) = line.strip_suffix("</text>") { + text.push('\n'); + text.push_str(rest); + return Ok(text); + } + text.push('\n'); + text.push_str(line); + } + anyhow::bail!("unexpected end of input inside <text> block") +} + +/// Builds a file entry, reporting an error if the license or copyright text is missing. +fn build_file_entry( + name: String, + concluded_license: Option<String>, + copyright_text: Option<String>, +) -> Result<SpdxFileEntry, Error> { + Ok(SpdxFileEntry { + name, + concluded_license: concluded_license + .ok_or_else(|| anyhow::anyhow!("file missing LicenseConcluded"))?, + copyright_text: copyright_text + .ok_or_else(|| anyhow::anyhow!("file missing FileCopyrightText"))?, + }) +} + +#[cfg(test)] +mod tests;
diff --git a/src/tools/collect-license-metadata/src/spdx/tests.rs b/src/tools/collect-license-metadata/src/spdx/tests.rs new file mode 100644 index 0000000..5b7cb41 --- /dev/null +++ b/src/tools/collect-license-metadata/src/spdx/tests.rs
@@ -0,0 +1,134 @@ +use super::*; + +// Clause 8.1 ("File name field") specifies that each file section begins with +// a `FileName` tag whose value is a relative path prefixed with "./". +// Clause 8.5 ("Concluded license") and 8.8 ("Copyright text") give the +// corresponding per-file fields; plain (unwrapped) values must be returned verbatim. +// <https://spdx.github.io/spdx-spec/v2.3/file-information/> +#[test] +fn single_file_entry() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: LGPL-2.0-only +FileCopyrightText: Copyright 2008-2010 John Smith"; + + let files = parse_tag_value(input).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].name, "./package/foo.c"); + assert_eq!(files[0].concluded_license, "LGPL-2.0-only"); + assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith"); +} + +// Clause 8.5 shows compound SPDX licence expressions as valid values for +// `LicenseConcluded`, e.g. "(LGPL-2.0-only OR LicenseRef-2)"; it is stored unparsed. +// <https://spdx.github.io/spdx-spec/v2.3/file-information/> +#[test] +fn compound_license_expression() { + let input = "\ +FileName: ./src/lib.rs +LicenseConcluded: (LGPL-2.0-only OR LicenseRef-2) +FileCopyrightText: Copyright Example Company"; + + let files = parse_tag_value(input).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].concluded_license, "(LGPL-2.0-only OR LicenseRef-2)"); +} + +// Clause 8.8 shows the copyright text wrapped in a single-line +// <text>...</text> block, whose markers must be stripped from the value, e.g. +// `FileCopyrightText: <text>Copyright 2008-2010 John Smith</text>` +// <https://spdx.github.io/spdx-spec/v2.3/file-information/> +#[test] +fn single_line_text_block() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: LGPL-2.0-only +FileCopyrightText: <text>Copyright 2008-2010 John Smith</text>"; + + let files = parse_tag_value(input).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith"); +} + +// Clause 6.10 ("Creator comment") demonstrates a multi-line <text>...</text> block.
+// <https://spdx.github.io/spdx-spec/v2.3/document-creation-information/> +#[test] +fn multi_line_text_block() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: MIT +FileCopyrightText: <text>Copyright 2008-2010 John Smith +Copyright 2019 Jane Doe</text>"; + // The line break inside the block must survive as '\n' in the parsed value. + let files = parse_tag_value(input).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith\nCopyright 2019 Jane Doe"); +} + +// Clause 5 ("Composition of an SPDX document") states that a document may +// contain zero or many File Information sections. Each `FileName` tag starts +// a new section, so consecutive file blocks must be parsed independently. +// <https://spdx.github.io/spdx-spec/v2.3/composition-of-an-SPDX-document/> +#[test] +fn multiple_file_entries() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: LGPL-2.0-only +FileCopyrightText: Copyright 2008-2010 John Smith +FileName: ./package/bar.c +LicenseConcluded: MIT +FileCopyrightText: Copyright Example Company"; + + let files = parse_tag_value(input).unwrap(); + assert_eq!(files.len(), 2); + + assert_eq!(files[0].name, "./package/foo.c"); + assert_eq!(files[0].concluded_license, "LGPL-2.0-only"); + assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith"); + + assert_eq!(files[1].name, "./package/bar.c"); + assert_eq!(files[1].concluded_license, "MIT"); + assert_eq!(files[1].copyright_text, "Copyright Example Company"); +} + +// A file section without a `LicenseConcluded` tag is malformed and must be rejected. +#[test] +fn missing_license_is_an_error() { + let input = "\ +FileName: ./package/foo.c +FileCopyrightText: Copyright 2008-2010 John Smith"; + + assert!(parse_tag_value(input).is_err()); +} + +// A file section without a `FileCopyrightText` tag is malformed and must be rejected.
+#[test] +fn missing_copyright_is_an_error() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: MIT"; + + assert!(parse_tag_value(input).is_err()); +} + +// A section with an unterminated <text> block (no closing </text>) must be rejected as well. +#[test] +fn unterminated_text_block_is_an_error() { + let input = "\ +FileName: ./package/foo.c +LicenseConcluded: MIT +FileCopyrightText: <text>Copyright 2008-2010 John Smith"; + + assert!(parse_tag_value(input).is_err()); +} + +// A document with no `FileName` tags at all should produce an empty result, not an error. +#[test] +fn empty_document_returns_no_entries() { + let input = "\ +SPDXVersion: SPDX-2.3 +DataLicense: CC0-1.0"; + + let files = parse_tag_value(input).unwrap(); + assert!(files.is_empty()); +}