blob: cbbed3a5c4170528d97457cf238050a878a01e38 [file] [log] [blame]
//! Support for translating links to the standard library.
use crate::{Diagnostics, bug, warn_or_err};
use anyhow::{Result, bail};
use mdbook::BookItem;
use mdbook::book::{Book, Chapter};
use once_cell::sync::Lazy;
use pulldown_cmark::{BrokenLink, CowStr, Event, LinkType, Options, Parser, Tag};
use regex::Regex;
use std::collections::HashMap;
use std::fmt::Write as _;
use std::fs;
use std::ops::Range;
use std::path::PathBuf;
use std::process::Command;
use tempfile::TempDir;
/// Matches one `<li>LINK: ...</li>` entry in the HTML generated by rustdoc.
///
/// Capture group 1 is everything between `LINK: ` and the closing `</li>`.
static STD_LINK_EXTRACT_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r#"<li>LINK: (.*)</li>"#;
    Regex::new(pattern).unwrap()
});

/// Matches the start of an HTML anchor tag.
///
/// Capture group 1 is the value of the `href` attribute.
static ANCHOR_URL: Lazy<Regex> = Lazy::new(|| {
    let pattern = "<a href=\"([^\"]+)\"";
    Regex::new(pattern).unwrap()
});

/// Matches a markdown inline link, like `[foo](bar)`.
///
/// Capture group 1 is the bracketed text (`[foo]`), group 2 is the
/// parenthesized destination (`(bar)`).
static MD_LINK_INLINE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?s)(\[.+\])(\(.+\))";
    Regex::new(pattern).unwrap()
});

/// Matches a markdown reference link, like `[foo][bar]`.
///
/// Capture group 1 is the bracketed text (`[foo]`), group 2 is the bracketed
/// label (`[bar]`), which may be empty (`[]`).
static MD_LINK_REFERENCE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?s)(\[.+\])(\[.*\])";
    Regex::new(pattern).unwrap()
});

/// Matches a markdown shortcut link, like `[foo]`.
///
/// Capture group 1 is the whole bracketed text.
static MD_LINK_SHORTCUT: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?s)(\[.+\])";
    Regex::new(pattern).unwrap()
});
/// Converts links to the standard library to the online documentation in a
/// fashion similar to rustdoc intra-doc links.
pub fn std_links(book: &mut Book, diag: &mut Diagnostics) {
// Collect all links in all chapters.
let mut chapter_links = HashMap::new();
for item in book.iter() {
let BookItem::Chapter(ch) = item else {
continue;
};
if ch.is_draft_chapter() {
continue;
}
let key = ch.source_path.as_ref().unwrap();
chapter_links.insert(key, collect_markdown_links(&ch, diag));
}
// Write a Rust source file to use with rustdoc to generate intra-doc links.
let tmp = TempDir::with_prefix("mdbook-spec-").unwrap();
if let Err(e) = run_rustdoc(&tmp, &chapter_links, diag) {
warn_or_err!(diag, "{e:?}");
return;
}
// Extract the links from the generated html.
let generated = fs::read_to_string(tmp.path().join("doc/a/index.html"))
.expect("index.html failed to generate");
let mut urls: Vec<_> = STD_LINK_EXTRACT_RE
.captures_iter(&generated)
.map(|cap| cap.get(1).unwrap().as_str())
.collect();
let mut urls = &mut urls[..];
let expected_len: usize = chapter_links.values().map(|l| l.len()).sum();
if urls.len() != expected_len {
bug!(
"expected rustdoc to generate {} links, but found {}",
expected_len,
urls.len(),
);
}
// Unflatten the urls list so that it is split back by chapter.
let mut ch_urls: HashMap<&PathBuf, Vec<_>> = HashMap::new();
for (ch_path, links) in &chapter_links {
let xs;
(xs, urls) = urls.split_at_mut(links.len());
ch_urls.insert(ch_path, xs.into());
}
// Do this in two passes to deal with lifetimes.
let mut ch_contents = HashMap::new();
for item in book.iter() {
let BookItem::Chapter(ch) = item else {
continue;
};
if ch.is_draft_chapter() {
continue;
}
let key = ch.source_path.as_ref().unwrap();
// Create a list of replacements to make in the raw markdown to point to the new url.
let replacements = compute_replacements(&ch, &chapter_links[key], &ch_urls[key], diag);
let mut new_contents = ch.content.clone();
for (md_link, url, range) in replacements {
// Convert links to be relative so that links work offline and
// with the linkchecker.
let url = relative_url(url, ch);
// Note that this may orphan reference link definitions. This should
// probably remove them, but pulldown_cmark doesn't give the span for
// the reference definition.
new_contents.replace_range(range, &format!("{md_link}({url})"));
}
ch_contents.insert(key.clone(), new_contents);
}
// Replace the content with the new content.
book.for_each_mut(|item| {
let BookItem::Chapter(ch) = item else {
return;
};
if ch.is_draft_chapter() {
return;
}
let key = ch.source_path.as_ref().unwrap();
let content = ch_contents.remove(key).unwrap();
ch.content = content;
});
}
/// A markdown link that is a candidate for translation to a standard library
/// documentation URL.
#[derive(Debug)]
struct Link<'a> {
/// The kind of markdown link syntax, as reported by pulldown_cmark (for
/// example inline, reference, collapsed, or shortcut).
link_type: LinkType,
/// Where the link is going to, for example `std::ffi::OsString`.
dest_url: CowStr<'a>,
/// The span in the original markdown where the link is located.
///
/// Note that this is the post-processed markdown (such as having rules
/// expanded), not the markdown on the disk.
///
/// Note that during translation, all links will be converted to inline
/// links. That means that for reference-style links, the link reference
/// definition will end up being ignored in the final markdown. For
/// example, a link like ``[`OsString`]`` with a definition
/// ``[`OsString`]: std::ffi::OsString`` will convert the link to
/// ``[`OsString`](https://doc.rust-lang.org/std/ffi/struct.OsString.html)``.
range: Range<usize>,
}
/// Gathers every markdown link in `chapter` that could plausibly refer to the
/// standard library.
///
/// Autolinks, email links, and links whose destination starts with `http`,
/// starts with `#`, or contains `.md` or `.html` are skipped. Links that
/// pulldown_cmark considers broken (no matching definition) are included,
/// appended after the regular links. A link with a title is reported through
/// `diag` but still collected.
fn collect_markdown_links<'a>(chapter: &'a Chapter, diag: &mut Diagnostics) -> Vec<Link<'a>> {
    let opts = Options::ENABLE_TABLES
        | Options::ENABLE_FOOTNOTES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS
        | Options::ENABLE_HEADING_ATTRIBUTES
        | Options::ENABLE_SMART_PUNCTUATION;

    // Something like `[std::option::Option]` is a broken link as far as
    // pulldown_cmark is concerned, but it is the normal syntax for rustdoc,
    // so broken links are recorded as candidates too.
    let mut broken_links = Vec::new();
    let on_broken_link = |broken: BrokenLink<'_>| {
        broken_links.push(Link {
            link_type: broken.link_type,
            // An owned copy is necessary due to lifetime issues.
            dest_url: CowStr::Boxed(broken.reference.into_string().into()),
            range: broken.span,
        });
        None
    };

    let mut links = Vec::new();
    let events =
        Parser::new_with_broken_link_callback(&chapter.content, opts, Some(on_broken_link))
            .into_offset_iter();
    for (event, range) in events {
        let Event::Start(Tag::Link {
            link_type,
            dest_url,
            title,
            id: _,
        }) = event
        else {
            continue;
        };

        // Only links that could be for the standard library are of interest.
        let is_auto = matches!(link_type, LinkType::Autolink | LinkType::Email);
        let is_web_or_local = dest_url.starts_with("http")
            || dest_url.contains(".md")
            || dest_url.contains(".html")
            || dest_url.starts_with('#');
        if is_auto || is_web_or_local {
            continue;
        }

        if !title.is_empty() {
            warn_or_err!(
                diag,
                "titles in links are not supported\n\
                 Link {dest_url} has title `{title}` found in chapter {} ({:?})",
                chapter.name,
                chapter.source_path.as_ref().unwrap()
            );
        }
        links.push(Link {
            link_type,
            dest_url,
            range,
        });
    }

    links.extend(broken_links);
    links
}
/// Generates links using rustdoc.
///
/// This takes the given links and creates a temporary Rust source file
/// containing those links within doc-comments, and then runs rustdoc to
/// generate intra-doc links on them.
///
/// The output will be in the given `tmp` directory.
fn run_rustdoc(
tmp: &TempDir,
chapter_links: &HashMap<&PathBuf, Vec<Link<'_>>>,
diag: &mut Diagnostics,
) -> Result<()> {
let src_path = tmp.path().join("a.rs");
// Allow redundant since there could some in-scope things that are
// technically not necessary, but we don't care about (like
// [`Option`](std::option::Option)).
let mut src = format!(
"#![{}(rustdoc::broken_intra_doc_links)]\n\
#![allow(rustdoc::redundant_explicit_links)]\n",
if diag.deny_warnings { "deny" } else { "warn" }
);
// This uses a list to make easy to pull the links out of the generated HTML.
for (_ch_path, links) in chapter_links {
for link in links {
match link.link_type {
LinkType::Inline
| LinkType::Reference
| LinkType::Collapsed
| LinkType::Shortcut => {
writeln!(src, "//! - LINK: [{}]", link.dest_url).unwrap();
}
LinkType::ReferenceUnknown
| LinkType::CollapsedUnknown
| LinkType::ShortcutUnknown => {
// These should only happen due to broken link replacements.
bug!("unexpected link type unknown {link:?}");
}
LinkType::Autolink | LinkType::Email => {
bug!("link type should have been filtered {link:?}");
}
}
}
}
// Put some common things into scope so that links to them work.
writeln!(
src,
"extern crate alloc;\n\
extern crate proc_macro;\n\
extern crate test;\n"
)
.unwrap();
fs::write(&src_path, &src).unwrap();
let rustdoc = std::env::var("RUSTDOC").unwrap_or_else(|_| "rustdoc".into());
let output = Command::new(rustdoc)
.arg("--edition=2024")
.arg(&src_path)
.current_dir(tmp.path())
.output()
.expect("rustdoc installed");
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
bail!(
"failed to extract std links ({:?})\n{stderr}",
output.status
);
}
Ok(())
}
/// Matches the leading part of a `doc.rust-lang.org` documentation URL: the
/// scheme, the host, and a release channel (`nightly`, `beta`, `stable`,
/// `dev`) or a `1.x.y` version number.
///
/// The match is anchored at the start of the string. The dots in the host
/// name are escaped so that they only match a literal `.`.
static DOC_URL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"^https://doc\.rust-lang\.org/(?:nightly|beta|stable|dev|1\.[0-9]+\.[0-9]+)")
        .unwrap()
});
/// Converts a URL to doc.rust-lang.org to be relative.
fn relative_url(url: &str, chapter: &Chapter) -> String {
// Set SPEC_RELATIVE=0 to disable this, which can be useful for working locally.
if std::env::var("SPEC_RELATIVE").as_deref() != Ok("0") {
let Some(url_start) = DOC_URL.shortest_match(url) else {
bug!("expected rustdoc URL to start with {DOC_URL:?}, got {url}");
};
let url_path = &url[url_start..];
let num_dots = chapter.path.as_ref().unwrap().components().count();
let dots = vec![".."; num_dots].join("/");
format!("{dots}{url_path}")
} else {
url.to_string()
}
}
/// Computes the replacements to make in the markdown content.
///
/// Returns a `Vec` of `(md_link, url, range)` where:
///
/// - `md_link` is the markdown link string to show to the user (like `[foo]`).
/// - `url` is the URL to the standard library.
/// - `range` is the range in the original markdown to replace with the new link.
fn compute_replacements<'a>(
chapter: &'a Chapter,
links: &[Link<'_>],
urls: &[&'a str],
diag: &mut Diagnostics,
) -> Vec<(&'a str, &'a str, Range<usize>)> {
let mut replacements = Vec::new();
for (url, link) in urls.iter().zip(links) {
let Some(cap) = ANCHOR_URL.captures(url) else {
let line = super::line_from_range(&chapter.content, &link.range);
warn_or_err!(
diag,
"broken markdown link found in {}\n\
Line is: {line}\n\
Link to `{}` could not be resolved by rustdoc to a known URL (result was `{}`).\n",
chapter.source_path.as_ref().unwrap().display(),
link.dest_url,
url
);
continue;
};
let url = cap.get(1).unwrap().as_str();
let md_link = &chapter.content[link.range.clone()];
let range = link.range.clone();
let add_link = |re: &Regex| {
let Some(cap) = re.captures(md_link) else {
bug!(
"expected link `{md_link}` of type {:?} to match regex {re}",
link.link_type
);
};
let md_link = cap.get(1).unwrap().as_str();
replacements.push((md_link, url, range));
};
match link.link_type {
LinkType::Inline => {
add_link(&MD_LINK_INLINE);
}
LinkType::Reference | LinkType::Collapsed => {
add_link(&MD_LINK_REFERENCE);
}
LinkType::Shortcut => {
add_link(&MD_LINK_SHORTCUT);
}
_ => {
bug!("unexpected link type: {link:#?}");
}
}
}
// Sort and reverse (so that it can replace bottom-up so ranges don't shift).
replacements.sort_by(|a, b| b.2.clone().partial_cmp(a.2.clone()).unwrap());
replacements
}