Rollup merge of #154823 - jakubadamw:spdx-rs-replacement, r=Mark-Simulacrum

Replace the spdx-rs dependency with a minimal in-tree SPDX tag-value parser

The spdx-rs crate [is no longer maintained](https://github.com/doubleopen-project/spdx-rs/pulls) and is behind on its own dependency updates. It is currently used in [the collect-license-metadata tool](https://github.com/rust-lang/rust/tree/main/src/tools/collect-license-metadata), which calls a single function from it: `spdx_rs::parsers::spdx_from_tag_value`. That function parses the output of the `reuse` tool to extract file names, licences and copyright text.

This PR replaces the use of said function with a minimal in-tree parser that handles just the subset of the SPDX tag-value format that is needed: single-line `Tag: Value` pairs and multi-line `<text>...</text>` blocks.

Coincidentally, this gets rid of the last transitive dependency on syn v1.
diff --git a/Cargo.lock b/Cargo.lock
index 3b56dd6..5a9764b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -199,7 +199,7 @@
  "rustc-hash 2.1.1",
  "serde",
  "serde_derive",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -396,7 +396,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -573,7 +573,6 @@
 dependencies = [
  "iana-time-zone",
  "num-traits",
- "serde",
  "windows-link 0.2.1",
 ]
 
@@ -635,10 +634,10 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -768,7 +767,6 @@
  "serde",
  "serde_json",
  "similar",
- "spdx-rs",
 ]
 
 [[package]]
@@ -804,7 +802,7 @@
  "nom",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1047,7 +1045,7 @@
  "proc-macro2",
  "quote",
  "scratch",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1061,7 +1059,7 @@
  "indexmap",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1079,7 +1077,7 @@
  "indexmap",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1103,7 +1101,7 @@
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1114,7 +1112,7 @@
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1146,7 +1144,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1167,7 +1165,7 @@
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1177,7 +1175,7 @@
 checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
 dependencies = [
  "derive_builder_core",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1189,7 +1187,7 @@
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1264,7 +1262,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -1688,12 +1686,6 @@
 
 [[package]]
 name = "heck"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
-
-[[package]]
-name = "heck"
 version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
@@ -2092,7 +2084,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -2389,7 +2381,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -2534,7 +2526,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -2915,7 +2907,7 @@
  "pest_meta",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -3125,7 +3117,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -3316,7 +3308,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -3409,7 +3401,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -4125,7 +4117,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -4271,7 +4263,7 @@
  "fluent-syntax",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
  "synstructure",
 ]
 
@@ -4852,7 +4844,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
  "synstructure",
 ]
 
@@ -4952,7 +4944,7 @@
  "proc-macro2",
  "quote",
  "serde",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5069,7 +5061,7 @@
  "proc-macro2",
  "quote",
  "serde_derive_internals",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5155,7 +5147,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5166,7 +5158,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5300,35 +5292,6 @@
 ]
 
 [[package]]
-name = "spdx-expression"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53d7ac03c67c572d85049d6db815e20a4a19b41b3d5cca732ac582342021ad77"
-dependencies = [
- "nom",
- "serde",
- "thiserror 1.0.69",
- "tracing",
-]
-
-[[package]]
-name = "spdx-rs"
-version = "0.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "990870190ec8d8c64ba66e4a6746243d6e57d99353991e0e6092334833f429b1"
-dependencies = [
- "chrono",
- "log",
- "nom",
- "serde",
- "spdx-expression",
- "strum",
- "strum_macros",
- "thiserror 1.0.69",
- "uuid",
-]
-
-[[package]]
 name = "stable_deref_trait"
 version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -5394,36 +5357,6 @@
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
 [[package]]
-name = "strum"
-version = "0.24.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
-
-[[package]]
-name = "strum_macros"
-version = "0.24.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
-dependencies = [
- "heck 0.4.1",
- "proc-macro2",
- "quote",
- "rustversion",
- "syn 1.0.109",
-]
-
-[[package]]
-name = "syn"
-version = "1.0.109"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
-dependencies = [
- "proc-macro2",
- "quote",
- "unicode-ident",
-]
-
-[[package]]
 name = "syn"
 version = "2.0.110"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -5442,7 +5375,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5579,7 +5512,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5590,7 +5523,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -5832,7 +5765,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -6028,7 +5961,7 @@
 dependencies = [
  "proc-macro-hack",
  "quote",
- "syn 2.0.110",
+ "syn",
  "unic-langid-impl",
 ]
 
@@ -6266,7 +6199,7 @@
  "bumpalo",
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
  "wasm-bindgen-shared",
 ]
 
@@ -6523,7 +6456,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -6534,7 +6467,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -6904,7 +6837,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
  "synstructure",
 ]
 
@@ -6925,7 +6858,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
@@ -6945,7 +6878,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
  "synstructure",
 ]
 
@@ -6980,7 +6913,7 @@
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.110",
+ "syn",
 ]
 
 [[package]]
diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml
index f84da24..ae41d2f 100644
--- a/src/tools/collect-license-metadata/Cargo.toml
+++ b/src/tools/collect-license-metadata/Cargo.toml
@@ -10,4 +10,3 @@
 serde = { version = "1.0.147", features = ["derive"] }
 serde_json = "1.0.85"
 similar = "2.7.0"
-spdx-rs = "0.5.1"
diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs
index 4e218ea..156871b 100644
--- a/src/tools/collect-license-metadata/src/main.rs
+++ b/src/tools/collect-license-metadata/src/main.rs
@@ -1,6 +1,7 @@
 mod licenses;
 mod path_tree;
 mod reuse;
+mod spdx;
 
 use std::path::PathBuf;
 
diff --git a/src/tools/collect-license-metadata/src/reuse.rs b/src/tools/collect-license-metadata/src/reuse.rs
index dbe4678..6bc4145 100644
--- a/src/tools/collect-license-metadata/src/reuse.rs
+++ b/src/tools/collect-license-metadata/src/reuse.rs
@@ -15,18 +15,15 @@ pub(crate) fn collect(
     let raw = &obtain_spdx_document(reuse_exe)?;
     println!("finished gathering the license information from REUSE in {:.2?}", start.elapsed());
 
-    let document = spdx_rs::parsers::spdx_from_tag_value(&raw)?;
+    let files = crate::spdx::parse_tag_value(raw)?;
 
     let mut result = Vec::new();
-    for file in document.file_information {
-        let concluded_license = file.concluded_license.expect("File should have licence info");
-        let copyright_text = file.copyright_text.expect("File should have copyright text");
+    for file in files {
         let license = interner.intern(License {
-            spdx: concluded_license.to_string(),
-            copyright: copyright_text.split('\n').map(|s| s.into()).collect(),
+            spdx: file.concluded_license,
+            copyright: file.copyright_text.split('\n').map(|s| s.into()).collect(),
         });
-
-        result.push((file.file_name.into(), license));
+        result.push((file.name.into(), license));
     }
 
     Ok(result)
diff --git a/src/tools/collect-license-metadata/src/spdx/mod.rs b/src/tools/collect-license-metadata/src/spdx/mod.rs
new file mode 100644
index 0000000..a94f2bc
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/spdx/mod.rs
@@ -0,0 +1,102 @@
+use anyhow::Error;
+
+/// A single file entry extracted from an SPDX tag-value document.
+pub(crate) struct SpdxFileEntry {
+    pub(crate) name: String,
+    pub(crate) concluded_license: String,
+    pub(crate) copyright_text: String,
+}
+
+/// Parses an SPDX tag-value document and extracts file information.
+///
+/// This is a minimal parser that only extracts the fields we need
+/// (FileName, LicenseConcluded, FileCopyrightText) rather than the full model.
+/// The format is specified by the SPDX specification:
+/// each line is a `Tag: Value` pair,
+/// and multi-line values are wrapped in `<text>…</text>`.
+pub(crate) fn parse_tag_value(input: &str) -> Result<Vec<SpdxFileEntry>, Error> {
+    let mut files = Vec::new();
+    let mut current_name: Option<String> = None;
+    let mut current_license: Option<String> = None;
+    let mut current_copyright: Option<String> = None;
+
+    let mut lines = input.lines();
+    while let Some(line) = lines.next() {
+        let Some((tag, value)) = line.split_once(": ") else {
+            continue;
+        };
+
+        let value = resolve_multiline_value(value, &mut lines)?;
+
+        match tag {
+            "FileName" => {
+                // A new file section begins. Flush the previous one if present.
+                if let Some(name) = current_name.take() {
+                    files.push(build_file_entry(
+                        name,
+                        current_license.take(),
+                        current_copyright.take(),
+                    )?);
+                }
+                current_name = Some(value);
+                current_license = None;
+                current_copyright = None;
+            }
+            "LicenseConcluded" => current_license = Some(value),
+            "FileCopyrightText" => current_copyright = Some(value),
+            _ => {}
+        }
+    }
+
+    // Flush the last file section.
+    if let Some(name) = current_name {
+        files.push(build_file_entry(name, current_license, current_copyright)?);
+    }
+
+    Ok(files)
+}
+
+/// Resolves a tag value that might span multiple lines using `<text>…</text>`.
+fn resolve_multiline_value<'a>(
+    value: &str,
+    further_lines: &mut impl Iterator<Item = &'a str>,
+) -> Result<String, Error> {
+    let Some(start) = value.strip_prefix("<text>") else {
+        return Ok(value.to_string());
+    };
+
+    // The closing tag might be on the same line.
+    if let Some(content) = start.strip_suffix("</text>") {
+        return Ok(content.to_string());
+    }
+
+    let mut text = start.to_string();
+    for line in further_lines.by_ref() {
+        if let Some(rest) = line.strip_suffix("</text>") {
+            text.push('\n');
+            text.push_str(rest);
+            return Ok(text);
+        }
+        text.push('\n');
+        text.push_str(line);
+    }
+
+    anyhow::bail!("unexpected end of input inside <text> block")
+}
+
+fn build_file_entry(
+    name: String,
+    concluded_license: Option<String>,
+    copyright_text: Option<String>,
+) -> Result<SpdxFileEntry, Error> {
+    Ok(SpdxFileEntry {
+        name,
+        concluded_license: concluded_license
+            .ok_or_else(|| anyhow::anyhow!("file missing LicenseConcluded"))?,
+        copyright_text: copyright_text
+            .ok_or_else(|| anyhow::anyhow!("file missing FileCopyrightText"))?,
+    })
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/src/tools/collect-license-metadata/src/spdx/tests.rs b/src/tools/collect-license-metadata/src/spdx/tests.rs
new file mode 100644
index 0000000..5b7cb41
--- /dev/null
+++ b/src/tools/collect-license-metadata/src/spdx/tests.rs
@@ -0,0 +1,134 @@
+use super::*;
+
+// Clause 8.1 ("File name field") specifies that each file section begins with
+// a `FileName` tag whose value is a relative path prefixed with "./".
+// Clause 8.5 ("Concluded license") and 8.8 ("Copyright text") give the
+// corresponding per-file fields.
+// <https://spdx.github.io/spdx-spec/v2.3/file-information/>
+#[test]
+fn single_file_entry() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: LGPL-2.0-only
+FileCopyrightText: Copyright 2008-2010 John Smith";
+
+    let files = parse_tag_value(input).unwrap();
+    assert_eq!(files.len(), 1);
+    assert_eq!(files[0].name, "./package/foo.c");
+    assert_eq!(files[0].concluded_license, "LGPL-2.0-only");
+    assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith");
+}
+
+// Clause 8.5 shows compound SPDX licence expressions as valid values for
+// `LicenseConcluded`, e.g. "(LGPL-2.0-only OR LicenseRef-2)".
+// <https://spdx.github.io/spdx-spec/v2.3/file-information/>
+#[test]
+fn compound_license_expression() {
+    let input = "\
+FileName: ./src/lib.rs
+LicenseConcluded: (LGPL-2.0-only OR LicenseRef-2)
+FileCopyrightText: Copyright Example Company";
+
+    let files = parse_tag_value(input).unwrap();
+    assert_eq!(files.len(), 1);
+    assert_eq!(files[0].concluded_license, "(LGPL-2.0-only OR LicenseRef-2)");
+}
+
+// Clause 8.8 shows the copyright text wrapped in a single-line
+// <text>...</text> block: e.g.
+// `FileCopyrightText: <text>Copyright 2008-2010 John Smith</text>`
+// <https://spdx.github.io/spdx-spec/v2.3/file-information/>
+#[test]
+fn single_line_text_block() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: LGPL-2.0-only
+FileCopyrightText: <text>Copyright 2008-2010 John Smith</text>";
+
+    let files = parse_tag_value(input).unwrap();
+    assert_eq!(files.len(), 1);
+    assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith");
+}
+
+// Clause 6.10 ("Creator comment") demonstrates a multi-line <text>...</text> block.
+// <https://spdx.github.io/spdx-spec/v2.3/document-creation-information/>
+#[test]
+fn multi_line_text_block() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: MIT
+FileCopyrightText: <text>Copyright 2008-2010 John Smith
+Copyright 2019 Jane Doe</text>";
+
+    let files = parse_tag_value(input).unwrap();
+    assert_eq!(files.len(), 1);
+    assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith\nCopyright 2019 Jane Doe");
+}
+
+// Clause 5 ("Composition of an SPDX document") states that a document may
+// contain zero or many File Information sections. Each `FileName` tag starts
+// a new section, so consecutive file blocks must be parsed independently.
+// <https://spdx.github.io/spdx-spec/v2.3/composition-of-an-SPDX-document/>
+#[test]
+fn multiple_file_entries() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: LGPL-2.0-only
+FileCopyrightText: Copyright 2008-2010 John Smith
+FileName: ./package/bar.c
+LicenseConcluded: MIT
+FileCopyrightText: Copyright Example Company";
+
+    let files = parse_tag_value(input).unwrap();
+    assert_eq!(files.len(), 2);
+
+    assert_eq!(files[0].name, "./package/foo.c");
+    assert_eq!(files[0].concluded_license, "LGPL-2.0-only");
+    assert_eq!(files[0].copyright_text, "Copyright 2008-2010 John Smith");
+
+    assert_eq!(files[1].name, "./package/bar.c");
+    assert_eq!(files[1].concluded_license, "MIT");
+    assert_eq!(files[1].copyright_text, "Copyright Example Company");
+}
+
+// A file section without a `LicenseConcluded` tag is malformed.
+#[test]
+fn missing_license_is_an_error() {
+    let input = "\
+FileName: ./package/foo.c
+FileCopyrightText: Copyright 2008-2010 John Smith";
+
+    assert!(parse_tag_value(input).is_err());
+}
+
+// A file section without a `FileCopyrightText` tag is malformed.
+#[test]
+fn missing_copyright_is_an_error() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: MIT";
+
+    assert!(parse_tag_value(input).is_err());
+}
+
+// A section with an unterminated <text> block (no closing </text>) is malformed.
+#[test]
+fn unterminated_text_block_is_an_error() {
+    let input = "\
+FileName: ./package/foo.c
+LicenseConcluded: MIT
+FileCopyrightText: <text>Copyright 2008-2010 John Smith";
+
+    assert!(parse_tag_value(input).is_err());
+}
+
+// A document with no `FileName` tags at all should produce an empty result.
+#[test]
+fn empty_document_returns_no_entries() {
+    let input = "\
+SPDXVersion: SPDX-2.3
+DataLicense: CC0-1.0";
+
+    let files = parse_tag_value(input).unwrap();
+    assert!(files.is_empty());
+}