enriched-text

Rust parser for text/enriched as defined by RFC 1523
git clone git://git.alexwennerberg.com/enriched-text
Log | Files | Refs | README | LICENSE

commit 27b1c9d42a29127cf091296a38b2d39b0944fa27
parent 015118c6db6eda84e385a5954f25dec3519afe0a
Author: Johann150 <johann.galle@protonmail.com>
Date:   Sun,  9 Jan 2022 08:38:26 -0800

Improve algorithm, make more in line with spec

Diffstat:
M src/lib.rs | 215 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
1 file changed, 134 insertions(+), 81 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs @@ -32,45 +32,70 @@ pub enum Tag { Nofill, Param, // Unrecognized - Unrecognized, + Unrecognized(String), } -struct EnrichedTextParser<'a> { +pub struct EnrichedTextParser<'a> { cursor: usize, data: &'a [u8], - in_tag: bool, + next: Option<Token>, } // escape only < impl<'a> EnrichedTextParser<'a> { - fn new(s: &'a [u8]) -> Self { + pub fn new(s: &'a [u8]) -> Self { return Self { cursor: 0, data: s, - in_tag: false, + next: None, }; } } -impl<'a> Iterator for EnrichedTextParser<'a> { - type Item = Token; - fn next(&mut self) -> Option<Token> { +impl<'a> EnrichedTextParser<'a> { + /// Function to try and parse a formatting command at the current cursor + /// position. Iff a command is found, returns `Some` with `Token::Start` or + /// `Token::End` respectively. If no command is found, returns `None`. + /// + /// The cursor is only advanced if a command was found. + /// ### Panics + /// Panics if the cursor is at EOF. + fn try_parse_command(&mut self) -> Option<Token> { let start = self.cursor; - if self.data.len() == self.cursor { - return None; - } - // awkard - if self.in_tag { - if self.data[self.cursor] == b'<' { - self.cursor += 1 - } - let start = self.cursor; - while self.cursor < self.data.len() && self.data[self.cursor] != b'>' { + if self.data[self.cursor] == b'<' { + self.cursor += 1; + // maybe a tag + let is_end = if self.data[self.cursor] == b'/' { self.cursor += 1; - } - let tag = &self.data[start..self.cursor]; - let t = match std::str::from_utf8(tag) { - Ok(p) => match p.to_lowercase().trim_start_matches("/") { + true + } else { + false + }; + + /* + each formatting command may be no longer than 60 chars + + subtract 1 from the length so there will be at least 1 more byte + at the end of the data for the '>' + */ + let end = usize::min(self.data.len() - 1, self.cursor + 60); + + let command_len = self.data[self.cursor..end] + .iter() + .take_while(|byte| matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) + 
.count(); + + if command_len > 0 && self.data[self.cursor + command_len] == b'>' { + // indeed a formatting command + let command = + std::str::from_utf8(&self.data[self.cursor..self.cursor + command_len]) + // its safe to unwrap because we know the command consists of + // only ASCII alphanumerics from parsing it above + .unwrap() + // commands are case insensitive + .to_ascii_lowercase(); + + let command = match command.as_str() { "bold" => Tag::Bold, "italic" => Tag::Italic, "fixed" => Tag::Fixed, @@ -86,77 +111,105 @@ impl<'a> Iterator for EnrichedTextParser<'a> { "vertbatim" => Tag::Verbatim, "nofill" => Tag::Nofill, "param" => Tag::Param, - _ => Tag::Unrecognized, - }, - Err(_) => Tag::Unrecognized, - }; - self.in_tag = false; - self.cursor += 1; - if self.data[start] == b'/' { - return Some(Token::End(t)); - } else { - return Some(Token::Start(t)); + _ => Tag::Unrecognized(command), + }; + + // also count '>' + self.cursor += command_len + 1; + + return if is_end { + Some(Token::End(command)) + } else { + Some(Token::Start(command)) + }; } } - loop { - if self.cursor >= self.data.len() || self.in_tag { - let mut out = Vec::new(); - self.cursor -= 1; - let data = &self.data[start..=self.cursor]; - let mut crlf_count = 0; - let mut skip = false; - for (idx, window) in data.windows(2).enumerate() { - if skip { - skip = false; - continue; - } - if window[0] != b'\r' && crlf_count >= 1 { - if crlf_count >= 2 { - for _ in 0..crlf_count - 1 { - out.push(b'\r'); - out.push(b'\n'); - } - } else { - out.push(b' '); - } - crlf_count = 0; - } - if window[0] == b'\r' && window[1] == b'\n' { - crlf_count += 1; - skip = true; - } else if window[0] == b'<' && window[1] == b'<' { - continue; - } else { - out.push(window[0]); - } - if idx == data.len() - 2 { - out.push(window[1]); - } - } - self.cursor += 1; - if out.len() > 0 { - return Some(Token::Text(out)); + + // reset the cursor, maybe we fell through above + self.cursor = start; + None + } + + /// Processes text to 
handle CRLFs specially. + fn process_text(text: &'a [u8]) -> Vec<u8> { + let mut out = Vec::new(); + let mut lf_count = 0; + let mut i = 0; + + while i < text.len() { + if text[i] == b'\r' && text.get(i + 1) == Some(&b'\n') { + lf_count += 1; + i += 2; + continue; + } else if text[0] == b'\n' { + // also handle text with LF line endings gracefully + lf_count += 1; + i += 1; + continue; + } + // replace the line feeds appropriately + if lf_count > 1 { + for _ in 0..lf_count - 1 { + out.push(b'\r'); + out.push(b'\n'); } - return self.next(); + lf_count = 0; + } else if lf_count == 1 { + out.push(b' '); + lf_count = 0; } - if self.cursor + 1 < self.data.len() - && self.data[self.cursor] == b'<' - && self.data[self.cursor + 1] == b'<' - { - self.cursor += 2; + + if text[i] == b'<' && text.get(i + 1) == Some(&b'<') { + // skip escaped '<' + i += 1; } - if self.cursor < self.data.len() && self.data[self.cursor] == b'<' { - self.in_tag = true; + out.push(text[i]); + i += 1; + } + + out + } +} + +impl<'a> Iterator for EnrichedTextParser<'a> { + type Item = Token; + + fn next(&mut self) -> Option<Token> { + // return any command tokens we already parsed + if let Some(token) = self.next.take() { + return Some(token); + } + + let start = self.cursor; + while self.cursor < self.data.len() { + let maybe_end = self.cursor; + if let Some(command) = self.try_parse_command() { + // A command starts here, but first we have to return the text + // if there is any. + return Some(if start != maybe_end { + // there is some text, store the parsed command away for + // the next invocation + self.next = Some(command); + Token::Text(Self::process_text(&self.data[start..maybe_end])) + } else { + command + }); } else { self.cursor += 1; } } + + // must have hit EOF + if self.cursor != start { + Some(Token::Text(Self::process_text(&self.data[start..]))) + } else { + None + } } } #[cfg(test)] mod tests { - // Note this useful idiom: importing names from outer (for mod tests) scope. 
use super::*; #[test] @@ -212,8 +265,8 @@ mod tests { p.collect::<Vec<Token>>(), vec![ Token::Text(b"Stay".to_vec()), - Token::End(Tag::Unrecognized), - Token::Start(Tag::Unrecognized), + Token::End(Tag::Unrecognized("broken".into())), + Token::Start(Tag::Unrecognized("cool".into())), Token::Text(b"cool".to_vec()), ] )