nanohtml2text

Lightweight html to text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | LICENSE

commit 39d3f5d6a5457ed5efd95bbee1fe61f9ed120fcb
parent 0f96420abb8e47c548d6d28e5e8b7576e41fc588
Author: alex wennerberg <alex@alexwennerberg.com>
Date:   Thu,  6 Jan 2022 08:39:51 -0800

Add link support

Diffstat:
M src/main.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs @@ -19,8 +19,31 @@ fn decode_named_entity(entity: &str) -> Option<char> { const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"]; +// awkward +fn parse_link(l: &str) -> Option<&str> { + if l.starts_with("a") { + let s: Vec<&str> = l.split("href=").collect(); + if s.len() > 1 { + if s[1] != "" { + if s[1].as_bytes()[0] == b'\'' { + let end = s[1][1..].chars().position(|c| c == '\''); + if let Some(p) = end { + return Some(&s[1][1..=p]); + } + } else if s[1].as_bytes()[0] == b'"' { + let end = s[1][1..].chars().position(|c| c == '"'); + if let Some(p) = end { + return Some(&s[1][1..=p]); + } + } + } + } + } + None +} + fn is_bad_tag(t: &str) -> bool { - let t = t.trim_end(); + let t = t.split_whitespace().next().unwrap(); if BAD_TAGS.contains(&t) { return true; } @@ -66,6 +89,44 @@ fn parse_html_entity(ent_name: &str) -> Option<char> { None } +fn html_entitities_to_text(s: &str) -> String { + let mut out = String::new(); + let mut in_ent = false; + for (i, r) in s.chars().enumerate() { + if r == ';' && in_ent { + in_ent = false; + continue; + } else if r == '&' { + let mut ent_name = String::new(); + let mut is_ent = false; + let mut chars = 0; + for er in s[i + 1..].chars() { + if er == ';' { + is_ent = true; + break; + } else { + ent_name.push(er); + } + chars += 1; + if chars == 10 { + break; + } + } + if is_ent { + if let Some(ent) = parse_html_entity(&ent_name) { + out.push(ent); + in_ent = true; + continue; + } + } + } + if !in_ent { + out.push(r); + } + } + out +} + fn write_space(s: &mut String) { let b = s.as_bytes(); if b.len() > 0 && b[b.len() - 1] != b' ' { @@ -145,7 +206,12 @@ fn html2text(html: &str) -> String { can_print_new_line = false; } else if is_bad_tag(&tag_name_lower) { bad_tag_stack_depth += 1; - // TODO parse link + // parse link + if let Some(link) = parse_link(tag) { + if !link.contains("javascript:") { + out_buf.push_str(&html_entitities_to_text(link)); + } + } } else if tag_name_lower.len() > 0 
&& tag_name_lower.starts_with("/") && is_bad_tag(&tag_name_lower) @@ -168,6 +234,26 @@ mod tests { use super::*; const cases: &[(&str, &str)] = &[ ("blah", "blah"), + // links + ("<div></div>", ""), + ("<div>simple text</div>", "simple text"), + ("click <a href=\"test\">here</a>", "click test"), + ("click <a class=\"x\" href=\"test\">here</a>", "click test"), + ( + "click <a href=\"ents/&apos;x&apos;\">here</a>", + "click ents/'x'", + ), + ("click <a href=\"javascript:void(0)\">here</a>", "click "), + ( + "click <a href=\"test\"><span>here</span> or here</a>", + "click test", + ), + ( + "click <a href=\"http://bit.ly/2n4wXRs\">news</a>", + "click http://bit.ly/2n4wXRs", + ), + // ("<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>", "/wiki/yet#English, /wiki/not_yet#English"), + // inlines ("strong <strong>text</strong>", "strong text"), ("some <div id=\"a\" class=\"b\">div</div>", "some div"),