nanohtml2text

Lightweight html to text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | README | LICENSE

commit 600ebbe445074f298a4b694c3496d1c8bd2172b9
parent 77989212d3e56eca8931b3baad3cb497a57bcfa3
Author: Ayrat Badykov <ayratin555@gmail.com>
Date:   Sat, 24 Sep 2022 14:05:25 +0300

parse href along with its content

Diffstat:
M Cargo.lock |  2 +-
M src/lib.rs | 34 ++++++++++++++++++++++++----------
2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
@@ -4,4 +4,4 @@ version = 3
 
 [[package]]
 name = "nanohtml2text"
-version = "0.1.2"
+version = "0.1.3"
diff --git a/src/lib.rs b/src/lib.rs
@@ -121,12 +121,26 @@ fn handle_tag(s: &str) -> (String, usize) {
             // messed up from one uppercase symbol becoming two lowercase
             // symbols or something like that
             let more = more.to_ascii_lowercase();
-            let end = more
-                .find("</a")
+
+            let end_without_closing = more.find("</a");
+            let content = end_without_closing.map(|i| more[0..i].trim());
+
+            let end = end_without_closing
                 .map(|i| i + 3)
                 .and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
                 .unwrap_or_else(|| more.len());
-            (href.unwrap_or_default(), tag.len() + 1 + end)
+
+            let link = match (content, href) {
+                (Some(content_value), Some(href_value)) => {
+                    let cleaned_content_value = html2text(content_value);
+                    format!("{} ({})", cleaned_content_value, href_value)
+                }
+                (None, Some(href_value)) => href_value,
+                (Some(content_value), None) => content_value.to_string(),
+                (None, None) => "".to_string(),
+            };
+
+            (link, tag.len() + 1 + end)
         }
         "br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
         "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
@@ -225,25 +239,25 @@ mod tests {
         // links
         link:
             "click <a href=\"test\">here</a>"
-            to "click test",
+            to "click here (test)",
         links_ignore_attributes:
             "click <a class=\"x\" href=\"test\">here</a>"
-            to "click test",
+            to "click here (test)",
         link_entities_in_url:
             "click <a href=\"ents/&apos;x&apos;\">here</a>"
-            to "click ents/'x'",
+            to "click here (ents/'x')",
         link_javascript:
             "click <a href=\"javascript:void(0)\">here</a>"
-            to "click ",
+            to "click here",
         link_ignore_content_tags:
             "click <a href=\"test\"><span>here</span> or here</a>"
-            to "click test",
+            to "click here or here (test)",
         link_absolute_url:
             "click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
-            to "click http://bit.ly/2n4wXRs",
+            to "click news (http://bit.ly/2n4wXRs)",
         link_ignore_attributes_2:
             "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
-            to "/wiki/yet#English, /wiki/not_yet#English",
+            to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
         // inlines
         ignore_inline:
             "strong <strong>text</strong>"