Skip to content

Commit

Permalink
feat: css_sort (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
EdJoPaTo committed Jun 4, 2024
1 parent b7d0a88 commit f208dcd
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- new editor: `css_sort`
- new editor: `debug_files`

### Fixed
Expand Down
65 changes: 65 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,71 @@ editors:
- css_select: h1 > a
```

#### `css_sort`

Sort the elements matching the given [CSS Selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors).
Other elements not matching are kept.
Elements below different parents are sorted independently.

Basic example:

```html
<div><p>C</p><p>B</p></div>
<div><p>D</p><p>A</p></div>
```

with `p` as the selector will sort into this:

```html
<div><p>B</p><p>C</p></div>
<div><p>A</p><p>D</p></div>
```

Examples:

```yaml
editors:
  # Sort all articles
  - css_sort:
      selector: article
```

The above example sorts by the whole element ([`outerHTML`](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML)).
In order to sort by something specific for a given HTML element, editors can be used.

```yaml
editors:
  # Sort articles by their heading
  - css_sort:
      selector: article
      sort_by: # the specified editors are applied to every selected HTML element independently
        - css_select: h2
```

This might still sort in surprising ways as things like attributes are still included (`<h2 class="a">Z</h2>` is sorted before `<h2 class="z">A</h2>`).
Therefore, editors like [`html_textify`](#html_textify) or [`html_sanitize`](#html_sanitize) are likely a good idea to be used in `sort_by`.

Tip: [`debug_files`](#debug_files) can help you understand what is happening. But don't forget to remove it after you are done testing:

```yaml
editors:
  - css_sort:
      selector: article
      sort_by:
        - css_select: h2
        - html_sanitize
        - debug_files: /tmp/website-stalker/
```

You can also reverse the sorting:

```yaml
editors:
  - css_sort:
      selector: article
      reverse: true
```

#### `debug_files`

This editor passes its input through without modifying it.
Expand Down
191 changes: 191 additions & 0 deletions src/editor/css_sort.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use std::collections::HashMap;

use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
use url::Url;

use super::Editor;
use crate::logger;

/// Configuration of the `css_sort` editor.
///
/// Elements matched by `selector` are grouped by their direct parent and each
/// group is sorted independently (see `CssSort::apply`).
#[derive(Debug, Clone, Deserialize)]
pub struct CssSort {
/// CSS selector choosing the elements that take part in the sorting.
#[serde(deserialize_with = "super::deserialize_selector")]
pub selector: Selector,

/// When `true` the sorted order of every group is reversed. Defaults to `false`.
#[serde(default)]
pub reverse: bool,

/// Editors applied to the HTML of each selected element to produce its sort key.
/// When empty (the default) the element's own HTML is used as the key.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sort_by: Vec<Editor>,
}

impl CssSort {
    /// Sorts the elements matched by `selector` inside the given HTML document.
    ///
    /// Matched elements are grouped by their direct parent and every group is
    /// sorted independently. The sort key of an element is its HTML after it was
    /// passed through the `sort_by` editors (see `get_sort_key_from_element`).
    /// With `reverse` set, the sorted order of each group is reversed.
    ///
    /// Sorted elements only trade places with each other: every sibling that was
    /// not matched (other elements, text, comments, …) keeps its position among
    /// the parent's children.
    ///
    /// Logs a warning when the selector matches nothing and returns the
    /// serialized document in every case.
    pub fn apply(&self, url: &Url, html: &str) -> String {
        let mut html = Html::parse_document(html);

        let mut grouped_by_parent: HashMap<_, Vec<ElementRef>> = HashMap::new();
        for element in html.select(&self.selector) {
            // Elements without a parent cannot be reordered among siblings.
            if let Some(key) = element.parent().map(|parent| parent.id()) {
                grouped_by_parent.entry(key).or_default().push(element);
            }
        }

        if grouped_by_parent.is_empty() {
            logger::warn(&format!("css_sort selector selected nothing to sort {url}"));
        }

        // Get the order of the elements as ids
        // This removes the reference to html allowing to take mut references later on
        let sorted_groups = grouped_by_parent
            .into_iter()
            .map(|(parent, mut elements)| {
                elements.sort_by_cached_key(|element| self.get_sort_key_from_element(url, element));
                if self.reverse {
                    elements.reverse();
                }
                let ids = elements
                    .iter()
                    .map(|element| element.id())
                    .collect::<Vec<_>>();
                (parent, ids)
            })
            .collect::<Vec<_>>();

        for (parent, sorted_ids) in sorted_groups {
            let selected = sorted_ids
                .iter()
                .copied()
                .collect::<std::collections::HashSet<_>>();
            let mut next_sorted = sorted_ids.into_iter();

            // Walk the current children in sibling order. Every slot held by a selected
            // element receives the next element of the sorted sequence, all other
            // children stay exactly where they are.
            let new_order = html
                .tree
                .get(parent)
                .expect("parent id was taken from this tree")
                .children()
                .map(|child| {
                    let id = child.id();
                    if selected.contains(&id) {
                        next_sorted
                            .next()
                            .expect("every selected child has exactly one sorted counterpart")
                    } else {
                        id
                    }
                })
                .collect::<Vec<_>>();

            // Rebuild the child list of the parent in the new order.
            for id in &new_order {
                html.tree
                    .get_mut(*id)
                    .expect("child id was taken from this tree")
                    .detach();
            }
            let mut parent_mut = html
                .tree
                .get_mut(parent)
                .expect("parent id was taken from this tree");
            for id in new_order {
                parent_mut.append_id(id);
            }
        }

        html.html()
    }

    /// Creates the sort key of a single element.
    ///
    /// The element's HTML is run through the `sort_by` editors and the resulting
    /// text is the key. When an editor fails, the error is logged and an empty
    /// key is used, so sorting continues for the remaining elements.
    fn get_sort_key_from_element(&self, url: &Url, element: &ElementRef) -> String {
        let content = super::Content {
            extension: Some("html"),
            text: element.html(),
        };
        Editor::apply_many(&self.sort_by, url, content).map_or_else(
            |error| {
                logger::error(&format!("css_sort sort_by failed {error}"));
                String::new()
            },
            |content| content.text,
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Applies `css_sort` to `input` and compares the content of the resulting `<body>` with `expected`.
    #[track_caller]
    fn case(css_sort: &CssSort, input: &str, expected: &str) {
        const PREFIX: &str = "<html><head></head><body>";
        const SUFFIX: &str = "</body></html>";

        let url = Url::parse("https://edjopato.de/").unwrap();
        let html = css_sort.apply(&url, input);

        assert!(html.starts_with(PREFIX));
        assert!(html.ends_with(SUFFIX));
        let end_index = html.len() - SUFFIX.len();
        let html = html.get(PREFIX.len()..end_index).unwrap();

        assert_eq!(html, expected);
    }

    #[test]
    fn simple_example() {
        let input = "<p>A</p><p>C</p><p>B</p>";
        let expected = "<p>A</p><p>B</p><p>C</p>";
        let sort_by = CssSort {
            selector: Selector::parse("p").unwrap(),
            sort_by: Vec::new(),
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    #[test]
    fn reverse() {
        let input = "<p>A</p><p>C</p><p>B</p>";
        let expected = "<p>C</p><p>B</p><p>A</p>";
        let sort_by = CssSort {
            selector: Selector::parse("p").unwrap(),
            sort_by: Vec::new(),
            reverse: true,
        };
        case(&sort_by, input, expected);
    }

    #[test]
    fn sort_by() {
        let input = r#"<article><h3>A</h3><a id="Y">Bla</a></article><article><h3>B</h3><a id="X">Bla</a></article>"#;
        let expected = r#"<article><h3>B</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="Y">Bla</a></article>"#;
        let sort_by = CssSort {
            selector: Selector::parse("article").unwrap(),
            sort_by: vec![Editor::CssSelect(Selector::parse("a").unwrap())],
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    #[test]
    fn sort_by_same_key_keeps_order() {
        let input = r#"<article><h3>C</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="X">Bla</a></article>"#;
        let expected = r#"<article><h3>C</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="X">Bla</a></article>"#;
        let sort_by = CssSort {
            selector: Selector::parse("article").unwrap(),
            sort_by: vec![Editor::CssSelect(Selector::parse("a").unwrap())],
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    #[test]
    fn sorting_toplevel_keeps_children_unsorted() {
        let input = "<div><p>D</p><p>A</p></div><div><p>C</p><p>B</p></div>";
        let expected = "<div><p>C</p><p>B</p></div><div><p>D</p><p>A</p></div>";
        let sort_by = CssSort {
            selector: Selector::parse("div").unwrap(),
            sort_by: Vec::new(),
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    #[test]
    fn sorting_bottomlevel_keeps_parents_unsorted() {
        let input = "<div><p>D</p><p>A</p></div><div><p>C</p><p>B</p></div>";
        let expected = "<div><p>A</p><p>D</p></div><div><p>B</p><p>C</p></div>";
        let sort_by = CssSort {
            selector: Selector::parse("p").unwrap(),
            sort_by: Vec::new(),
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    /// Elements not matched by the selector keep their position,
    /// the sorted elements only swap places with each other.
    #[test]
    fn sort_with_other_elements() {
        let input = "<div>1</div><p>B</p><img><p>A</p>";
        let expected = "<div>1</div><p>A</p><img><p>B</p>";
        let sort_by = CssSort {
            selector: Selector::parse("p").unwrap(),
            sort_by: Vec::new(),
            reverse: false,
        };
        case(&sort_by, input, expected);
    }

    /// Text between the sorted elements (like whitespace) stays where it was.
    #[test]
    fn sort_keeps_text_between_elements() {
        let input = "<p>B</p> <p>A</p>";
        let expected = "<p>A</p> <p>B</p>";
        let sort_by = CssSort {
            selector: Selector::parse("p").unwrap(),
            sort_by: Vec::new(),
            reverse: false,
        };
        case(&sort_by, input, expected);
    }
}
7 changes: 7 additions & 0 deletions src/editor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use url::Url;

pub mod css_remove;
pub mod css_selector;
pub mod css_sort;
pub mod debug_files;
pub mod html_markdown;
pub mod html_pretty;
Expand All @@ -26,6 +27,7 @@ pub struct Content {
pub enum Editor {
CssRemove(#[serde(deserialize_with = "deserialize_selector")] scraper::Selector),
CssSelect(#[serde(deserialize_with = "deserialize_selector")] scraper::Selector),
CssSort(css_sort::CssSort),
DebugFiles(PathBuf),
HtmlMarkdownify,
HtmlPrettify,
Expand All @@ -42,6 +44,7 @@ impl Editor {
match self {
Self::CssRemove(_) => "css_remove",
Self::CssSelect(_) => "css_select",
Self::CssSort(_) => "css_sort",
Self::DebugFiles(_) => "debug_files",
Self::HtmlMarkdownify => "html_markdownify",
Self::HtmlPrettify => "html_prettify",
Expand All @@ -64,6 +67,10 @@ impl Editor {
extension: Some("html"),
text: css_selector::apply(selector, &input.text)?,
}),
Self::CssSort(sort) => Ok(Content {
extension: Some("html"),
text: sort.apply(url, &input.text),
}),
Self::DebugFiles(path) => debug_files::debug_files(path, input),
Self::HtmlMarkdownify => Ok(Content {
extension: Some("md"),
Expand Down

0 comments on commit f208dcd

Please sign in to comment.