nanohtml2text

Lightweight html to text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | LICENSE

commit db479b931e80b098e15607ae0b4aac7cb36408e9
parent 0b0d3502438a4978aa6bcf56ee981672194de66e
Author: Johann150 <johann.galle@protonmail.com>
Date:   Thu, 13 Jan 2022 02:05:35 +0100

improve API documentation

Diffstat:
M src/lib.rs | 13 +++++++++++++
1 file changed, 13 insertions(+), 0 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
@@ -154,6 +154,19 @@ fn handle_tag(s: &str) -> (String, usize) {
 _ => (String::new(), tag.len() + 1),
 }
 }
+
+/// Convert some HTML to plain text. Only some simple HTML tags are handled:
+/// - `a` tags are transformed to their href attribute value
+/// - paragraph, linebreak, heading, list, and list item tags insert different
+/// amounts of line breaks.
+/// - HTML comments as well as `head`, `script` and `style` are completely
+/// discarded, including their content
+/// - unknown tags are skipped, but their content is printed
+///
+/// HTML named entities will be replaced with the respective Unicode code point,
+/// and whitespace will be collapsed as is usual in HTML.
+///
+/// The resulting string will have CRLF line endings.
 pub fn html2text(html: &str) -> String {
 // collapse spaces
 let html = html.split_whitespace().collect::<Vec<_>>().join(" ");