nanohtml2text

Lightweight html to text converter in Rust
git clone git://git.alexwennerberg.com/nanohtml2text
Log | Files | Refs | README | LICENSE

commit 10c2a0cbff320ad04fede44255be26a40679759d
parent fd7383cdba3c2cfac3245774e7d7cbdcacc3f645
Author: Ayrat Badykov <ayratin555@gmail.com>
Date:   Sat, 23 Apr 2022 11:41:02 +0300

fix invalid parsing of invalid html entities

Diffstat:
M Cargo.toml | 2 +-
M src/lib.rs | 16 +++++++++-------
2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "nanohtml2text"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2018"
 readme = "README.txt"
 license = "MIT"
diff --git a/src/lib.rs b/src/lib.rs
@@ -50,13 +50,14 @@ fn html_entitities_to_text(s: &str) -> String {
             if let Some(entity) = parse_html_entity(&part[..end]) {
                 out.push(entity);
                 // get byte length of the char we did `find` above
-                let skip = &part[end..]
-                    .chars()
-                    .next()
-                    // we know there is another character so its safe to unwrap
-                    .unwrap()
-                    .len_utf8();
-                out.push_str(&part[end + skip..]);
+                let real_end = if let Some(next) = &part[end..].chars().next() {
+                    end + next.len_utf8()
+                } else {
+                    // invalid html entity that doesn't end with `;`
+                    end
+                };
+
+                out.push_str(&part[real_end..]);
             } else {
                 out.push('&');
                 out.push_str(part);
@@ -339,5 +340,6 @@ mod tests {
             "<aa >hello</aa>" to "hello",
         ignore_unknown_tag_attributes:
             "<aa x=\"1\">hello</aa>" to "hello",
+        invalid_html_entity_without_semicolon: "&hellip" to "…",
     }
 }