nanohtml2text

Lightweight html to text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | LICENSE

commit 00a9b04d903ba7c6e04dc0b055f32763c781bd11
parent 6e3fd37e736d69aa8afbd657b5a8e5e12527a934
Author: Johann150 <johann.galle@protonmail.com>
Date:   Wed, 12 Jan 2022 22:39:45 +0100

make several functions more concise

Make more use of try operator and standard library functions.
Do not operate on bytes directly.

Diffstat:
Msrc/lib.rs | 144++++++++++++++++++++++++++++++++++++++-----------------------------------------
1 file changed, 69 insertions(+), 75 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs @@ -14,25 +14,29 @@ const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"]; // awkward fn parse_link(l: &str) -> Option<&str> { - if l.starts_with("a") { - let s: Vec<&str> = l.split("href=").collect(); - if s.len() > 1 { - if s[1] != "" { - if s[1].as_bytes()[0] == b'\'' { - let end = s[1][1..].bytes().position(|c| c == b'\''); - if let Some(p) = end { - return Some(&s[1][1..=p]); - } - } else if s[1].as_bytes()[0] == b'"' { - let end = s[1][1..].bytes().position(|c| c == b'"'); - if let Some(p) = end { - return Some(&s[1][1..=p]); - } - } - } + let href_value = l + .strip_prefix('a')? + // check for the href and then discard everything before it + .split_once("href")? + .1 + // there might be whitespace between 'href' and '=' + .trim_start() + // check for and then discard the equal sign + .strip_prefix('=')? + // remove whitespace after the equal sign + .trim_start(); + + // find quoted string + match href_value.chars().next()? { + start @ '\'' | start @ '"' => { + let (end, _) = href_value + .char_indices() + .skip(1) + .find(|(_, c)| *c == start)?; + Some(&href_value[1..end]) } + _ => None, } - None } fn is_bad_tag(t: &str) -> bool { @@ -45,16 +49,18 @@ fn is_bad_tag(t: &str) -> bool { // replacing regex fn is_header(h: &str) -> bool { - let mut b = h.as_bytes(); - if b.len() == 3 && b[0] == b'/' { - b = &b[1..] - } - if b.len() == 2 && b[0] == b'h' { - if b'1' <= b[1] && b[1] <= b'6' { - return true; - } - } - false + // optionally remove leading slash + h.strip_prefix('/') + .unwrap_or(h) + // remove leading h + .strip_prefix('h') + // there should only be one more char + .filter(|h| h.len() == 1) + // if that all worked, take the char + .and_then(|h| h.chars().next()) + // if we have the char, check if its 1 to 6 + // or false if we dont have the char + .map_or(false, |c| matches!(c, '1'..='6')) } fn parse_html_entity(ent_name: &str) -> Option<char> { @@ -62,61 +68,49 @@ fn parse_html_entity(ent_name: &str) -> Option<char> { if d.is_some() { return d; } - // rewriting without regex - let lower = ent_name.to_lowercase(); - if lower.starts_with("#") && lower.len() > 1 { - let parsed; - if lower.as_bytes()[1] == b'x' && lower.len() > 2 { - parsed = u32::from_str_radix(&lower[2..], 16).ok(); - } else { - parsed = u32::from_str_radix(&lower[1..], 10).ok(); - } - return parsed.and_then(|n| { - if n == 9 || n == 10 || n == 13 || n > 32 { - return char::from_u32(n); - } - return None; - }); - } - None + let num = ent_name.strip_prefix("#")?; + if num.chars().next()? == 'x' { + u32::from_str_radix(&num[1..].to_lowercase(), 16) + } else { + // remaining string may be empty, but that will generate an Err(Empty) + u32::from_str_radix(num, 10) + } + .ok() + .filter(|n| !matches!(n, 9 | 10 | 13 | 32)) + .and_then(|n| char::from_u32(n)) } fn html_entitities_to_text(s: &str) -> String { let mut out = String::new(); - let mut in_ent = false; - for (i, r) in s.char_indices() { - if r == ';' && in_ent { - in_ent = false; - continue; - } else if r == '&' { - let mut ent_name = String::new(); - let mut is_ent = false; - let mut chars = 0; - for er in s[i + 1..].chars() { - if er == ';' { - is_ent = true; - break; - } else { - ent_name.push(er); - } - chars += 1; - if chars == 10 { - break; - } - } - if is_ent { - if let Some(ent) = parse_html_entity(&ent_name) { - out.push(ent); - in_ent = true; - continue; - } - } - } - if !in_ent { - out.push(r); + + // except for the first part, every part will have started with an ampersand + // thus the start of the remaining parts is a HTML entity + let mut parts = s.split('&'); + /* + skip first part. if the string started with an ampersand, the first part + will be an empty string + + if the string was empty, the first part will also be an empty string so its + safe to unwrap + */ + out.push_str(parts.next().unwrap()); + + for part in parts { + let end = part + // entity can be terminated by semicolon or whitespace + .find(|c: char| c.is_whitespace() || c == ';') + // entity can also terminated by end of string or start of + // another entity + .unwrap_or_else(|| part.len()); + if let Some(entity) = parse_html_entity(&part[..end]) { + out.push(entity); + out.push_str(&part[end..]); + } else { + out.push_str(part) } } + out }