nanohtml2text

Lightweight HTML-to-text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | README | LICENSE

commit b65548e2ca3ef706571ce0c4e28042e0df7448be
parent 8ad91b23d6caddd1c5b627f8060c99a71ce9cea9
Author: alex wennerberg <alex@alexwennerberg.com>
Date:   Mon,  3 Jan 2022 09:57:11 -0800

rewriting more code

Diffstat:
M src/main.rs | 46 ++++++++++++++++++++++++++++++++++++++++------
1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/src/main.rs b/src/main.rs
@@ -3,17 +3,24 @@ fn main() {
     println!("Hello, world!");
 }
 
-fn write_space(s: &mut String) {}
+fn parse_html_entity(ent_name: &str) {}
 
-fn html2text(input: &str) -> String {
-    let in_len = input.len();
+fn write_space(s: &mut String) {
+    let b = s.as_bytes();
+    if b.len() > 0 && b[b.len() - 1] != b' ' {
+        s.push(' ');
+    }
+}
+
+fn html2text(html: &str) -> String {
+    let in_len = html.len();
     let mut tag_start = 0;
     let mut in_ent = false;
     let mut bad_tag_stack_depth = 0;
     let mut should_output = true;
     let mut can_print_new_line = false;
     let mut out_buf = String::new();
-    for (i, r) in input.chars().enumerate() {
+    for (i, r) in html.chars().enumerate() {
         if in_len > 0 && i == in_len - 1 {
             can_print_new_line = false
         }
@@ -27,8 +34,35 @@ fn html2text(input: &str) -> String {
             continue;
         } else if r == '&' && should_output {
             let mut ent_name = String::new();
-            in_ent = false;
-            // parse the entity name, max 10 chars
+            let mut is_ent = false;
+            let mut chars = 10;
+            for er in html[i + 1..].chars() {
+                if er == ';' {
+                    is_ent = true;
+                    break;
+                } else {
+                    ent_name.push(er);
+                }
+                chars += 1;
+                if chars == 10 {
+                    break;
+                }
+            }
+            if is_ent {
+                // parseHTMLentity TODO
+            }
+        } else if r == '<' {
+            // start of tag
+            tag_start = i + 1;
+            should_output = false;
+            continue;
+        } else if r == '>' { // end of tag
+            // TODO
+        }
+
+        if should_output && bad_tag_stack_depth == 0 && !in_ent {
+            can_print_new_line = true;
+            out_buf.push(r);
         }
     }
     out_buf