// Source blob: 6f4ce4415f04a35ee92fa05fc9529e4b712fdd63
use std::path::PathBuf;
use std::sync::LazyLock;
use std::{fs, process};
use anyhow::Result;
use clap::Parser;
use ignore::Walk;
use regex::Regex;
// Command-line arguments, parsed in `main` via `Cli::parse()`.
// NOTE(review): the `///` comments on the fields are attached to clap-derived arguments and are
// presumably what `--help` prints; treat them as user-facing text when editing.
#[derive(Parser)]
struct Cli {
// Root handed to `Walk::new` in `main`; only files with the `md` extension are processed.
/// File or directory to check
path: PathBuf,
// When set, `main` writes the reformatted text back to each non-compliant file instead of
// only reporting it.
#[arg(long)]
/// Modify files that do not comply
overwrite: bool,
// `main` passes this to `lengthen_lines`, which uses it as the upper bound when deciding
// whether two adjacent lines may be joined; `comply` (the splitting pass) does not read it.
/// Applies to lines that are to be split
#[arg(long, default_value_t = 100)]
line_length_limit: usize,
}
// A line that already ends in sentence or clause punctuation (`.`, `?`, `;`, `!`, `,`, `-`).
// `lengthen_lines` never joins the following line onto such a line.
static REGEX_IGNORE_END: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\.|\?|;|!|,|\-)$").unwrap());

// A line that starts with `[label]: `, i.e. a link target definition; `ignore` skips these.
static REGEX_IGNORE_LINK_TARGETS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\[.+\]: ").unwrap());

// A sentence boundary followed by one whitespace character:
// - `.` unless preceded by another `.`, a digit, `-` or `*`,
// - `?` unless preceded by `r` (presumably to leave `r? @reviewer` commands alone),
// - `!`.
static REGEX_SPLIT: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"([^\.\d\-\*]\.|[^r]\?|!)\s").unwrap());

// list elements, numbered (1. or 1)) or not (- and *)
// The number may have several digits, so that entries such as `10.` are recognised too.
static REGEX_LIST_ENTRY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*(\d+\.|\-|\*|\d+\))\s+").unwrap());
/// Checks every Markdown file below the given path and reports the result per category.
///
/// Each file with the `md` extension is read, run through [`comply`] and then
/// [`lengthen_lines`], and compared with its original text:
/// - unchanged files are listed as "compliant",
/// - changed files are written back and listed as "made compliant" when `--overwrite` is set,
/// - otherwise they are listed as "not compliant" and the process exits with status 1.
///
/// # Errors
///
/// Returns an error if walking the directory, reading a file, or writing a file fails.
fn main() -> Result<()> {
    let cli = Cli::parse();
    let mut compliant = Vec::new();
    let mut not_compliant = Vec::new();
    let mut made_compliant = Vec::new();
    for result in Walk::new(cli.path) {
        let entry = result?;
        if entry.file_type().expect("no stdin").is_dir() {
            continue;
        }
        let path = entry.into_path();
        // Files without an extension, or with any extension other than `md`, are skipped.
        if !path.extension().is_some_and(|extension| extension == "md") {
            continue;
        }
        let old = fs::read_to_string(&path)?;
        let new = lengthen_lines(&comply(&old), cli.line_length_limit);
        // `path` is not needed after being recorded, so it is moved rather than cloned.
        if new == old {
            compliant.push(path);
        } else if cli.overwrite {
            fs::write(&path, new)?;
            made_compliant.push(path);
        } else {
            not_compliant.push(path);
        }
    }
    if !compliant.is_empty() {
        display("compliant", &compliant);
    }
    if !made_compliant.is_empty() {
        display("made compliant", &made_compliant);
    }
    if !not_compliant.is_empty() {
        display("not compliant", &not_compliant);
        // A non-zero status lets callers (e.g. scripts) detect files that still need fixing.
        process::exit(1);
    }
    Ok(())
}
/// Prints `header` followed by a colon, then every path as a `- ` bullet on its own line.
fn display(header: &str, paths: &[PathBuf]) {
    println!("{header}:");
    paths
        .iter()
        .map(|path| path.display())
        .for_each(|shown| println!("- {shown}"));
}
/// Returns `true` when `line` must be left exactly as it is.
///
/// A line is left alone when any of the following holds:
/// - it is inside a fenced code block (`in_code_block`),
/// - it is blank,
/// - it starts with `#`, or its first non-whitespace character is `>`,
/// - it contains `|`,
/// - it is a link target definition (`[label]: ...`),
/// - it contains one of the abbreviations `e.g.`, `n.b.`, ` etc.`, `i.e.` or `et. al`,
///   in any letter case, because their dots would be mistaken for sentence ends.
fn ignore(line: &str, in_code_block: bool) -> bool {
    const ABBREVIATIONS: [&str; 5] = ["e.g.", "n.b.", " etc.", "i.e.", "et. al"];

    // Cheap structural checks first, so that most ignored lines never get lowercased.
    if in_code_block
        || line.trim().is_empty()
        || line.starts_with('#')
        || line.trim_start().starts_with('>')
        || line.contains('|')
        || REGEX_IGNORE_LINK_TARGETS.is_match(line)
    {
        return true;
    }

    // Lowercase once and match every abbreviation against it, so that a capitalised form at the
    // start of a sentence (`E.g.`, `I.e.`) is recognised as well.
    let lowercase = line.to_lowercase();
    ABBREVIATIONS
        .iter()
        .any(|abbreviation| lowercase.contains(*abbreviation))
}
/// Breaks up every line that holds more than one sentence, one sentence per output line.
///
/// Fence lines (starting with three backticks after optional whitespace), lines inside a fenced
/// code block, and lines rejected by [`ignore`] are copied unchanged. For a line that is split,
/// the first piece keeps its original leading text, and every following piece is indented with
/// spaces: by the width of the list marker (including its surrounding whitespace) for list
/// entries, otherwise by the width of the line's own leading whitespace. Trailing whitespace is
/// removed from every piece of a split line.
///
/// The returned text always ends with `\n`; empty input therefore yields `"\n"`.
fn comply(content: &str) -> String {
    // The output is built front to back; this avoids both shifting the tail of a vector on every
    // split and keeping a second index in sync with the input position.
    let mut output: Vec<String> = Vec::new();
    let mut in_code_block = false;
    for line in content.lines() {
        if line.trim_start().starts_with("```") {
            in_code_block = !in_code_block;
            output.push(line.to_owned());
            continue;
        }
        if ignore(line, in_code_block) || !REGEX_SPLIT.is_match(line) {
            output.push(line.to_owned());
            continue;
        }
        let indent = match REGEX_LIST_ENTRY.find(line) {
            Some(list_marker) => list_marker.len(),
            None => line
                .find(|ch: char| !ch.is_whitespace())
                .expect("blank lines are rejected by `ignore`"),
        };
        let mut sentences = line.split_inclusive(&*REGEX_SPLIT);
        let first = sentences
            .next()
            .expect("splitting a non-empty line yields at least one piece");
        output.push(first.trim_end().to_owned());
        output.extend(
            sentences.map(|sentence| format!("{:indent$}{}", "", sentence.trim_end())),
        );
    }
    output.join("\n") + "\n"
}
/// Joins a line with the line that follows it when the pair is short enough.
///
/// Two adjacent lines are merged into `"{line} {next}"` (with the leading whitespace of `next`
/// removed) when the sum of their byte lengths is below `limit` and all of the following hold:
/// - neither line is a code fence, is inside a fenced code block, or is rejected by [`ignore`],
/// - the first line is outside any `<div ...>` ... `</div>` region and is not a div tag itself,
/// - the second line is not a `<div`/`</div` tag and is not a list entry,
/// - the first line contains no sentence boundary and does not end in punctuation
///   (see `REGEX_SPLIT` and `REGEX_IGNORE_END`).
///
/// At most two lines are merged per pass: the merged result is not considered again for joining
/// with the line after it.
///
/// The returned text always ends with `\n`; empty input therefore yields `"\n"`.
fn lengthen_lines(content: &str, limit: usize) -> String {
    let mut output: Vec<String> = Vec::new();
    let mut in_code_block = false;
    let mut in_html_div = false;
    let mut lines = content.lines().peekable();
    while let Some(line) = lines.next() {
        let trimmed = line.trim_start();
        if trimmed.starts_with("```") {
            in_code_block = !in_code_block;
        } else if trimmed.starts_with("<div") {
            in_html_div = true;
        } else if trimmed.starts_with("</div") {
            in_html_div = false;
        } else if !in_html_div
            && !ignore(line, in_code_block)
            && !REGEX_SPLIT.is_match(line)
            && !REGEX_IGNORE_END.is_match(line)
        {
            // Consume the next line only if it may be appended to the current one.
            let joined = lines.next_if(|next_line| {
                let next_line: &str = next_line;
                let next_trimmed = next_line.trim_start();
                // A div tag must stay on its own line: if it were swallowed here, the
                // `in_html_div` state above would never see it and the div's contents would be
                // reflowed as ordinary text.
                let keeps_own_line = next_trimmed.starts_with("```")
                    || next_trimmed.starts_with("<div")
                    || next_trimmed.starts_with("</div")
                    || ignore(next_line, in_code_block)
                    || REGEX_LIST_ENTRY.is_match(next_line);
                !keeps_own_line && line.len() + next_line.len() < limit
            });
            if let Some(next_line) = joined {
                output.push(format!("{line} {}", next_line.trim_start()));
                continue;
            }
        }
        output.push(line.to_owned());
    }
    output.join("\n") + "\n"
}
// Checks `comply`: sentences are split onto their own lines, continuation lines are indented to
// match their list marker or the line's own indentation, and headings, tables, abbreviations,
// code blocks, `main..` ranges and `r?` commands are left untouched.
#[test]
fn test_sembr() {
    // Leading whitespace inside these literals is significant: it is part of the text under test.
    let original = "
# some. heading
must! be. split?
1. ignore a dot after number. but no further
ignore | tables
ignore e.g. and
ignore i.e. and
ignore etc. and
ignore E.g. too
- list. entry
 * list. entry
 1) list. entry
```
some code. block
```
sentence with *italics* should not be ignored. truly.
git log main.. compiler
 foo. bar. baz
o? whatever
r? @reviewer
 r? @reviewer
";
    let expected = "
# some. heading
must!
be.
split?
1. ignore a dot after number.
   but no further
ignore | tables
ignore e.g. and
ignore i.e. and
ignore etc. and
ignore E.g. too
- list.
  entry
 * list.
   entry
 1) list.
    entry
```
some code. block
```
sentence with *italics* should not be ignored.
truly.
git log main.. compiler
 foo.
 bar.
 baz
o?
whatever
r? @reviewer
 r? @reviewer
";
    assert_eq!(expected, comply(original));
}
// Checks `lengthen_lines` with a limit of 50: short adjacent lines are joined, while div
// contents, list entries, code blocks and link targets keep their line structure.
#[test]
fn test_lengthen_lines() {
    // The blank line after each list entry is significant: without it the entry would be joined
    // with the short line that follows it.
    let original = "\
do not split
short sentences
<div class='warning'>
a bit of text inside
</div>
preserve next line
1. one

preserve next line
- two

preserve next line
* three

do not mess with code block chars
```
leave the
text alone
```
handle the
  indented well
[a target]: https://example.com
[another target]: https://example.com
";
    let expected = "\
do not split short sentences
<div class='warning'>
a bit of text inside
</div>
preserve next line
1. one

preserve next line
- two

preserve next line
* three

do not mess with code block chars
```
leave the
text alone
```
handle the indented well
[a target]: https://example.com
[another target]: https://example.com
";
    assert_eq!(expected, lengthen_lines(original, 50));
}