enriched-text

Rust parser for the text/enriched MIME content type, as defined by RFC 1523
git clone git://git.alexwennerberg.com/enriched-text
Log | Files | Refs | README

commit e65eee6a7ca08e9563bafa3c7d0879e54b1c1578
parent 63fdfbdbbbc6d80c3fa9dbd8558b6ce57e365322
Author: alex wennerberg <alex@alexwennerberg.com>
Date:   Sat,  8 Jan 2022 20:12:20 -0800

WIP

Diffstat:
Msrc/lib.rs | 93+++++++++++++++++++++++++++++++++++++------------------------------------------
1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs @@ -2,11 +2,9 @@ //! https://datatracker.ietf.org/doc/html/rfc1523 #[derive(Debug, Eq, PartialEq)] -pub enum Token<'a> { +pub enum Token { // returns bytes in order to be encoding-agnostic - Text(&'a [u8]), - CRLF, - LT, + Text(Vec<u8>), End(Tag), Start(Tag), } @@ -59,23 +57,13 @@ impl<'a> EnrichedTextParser<'a> { } impl<'a> Iterator for EnrichedTextParser<'a> { - type Item = Token<'a>; - fn next(&mut self) -> Option<Token<'a>> { + type Item = Token; + fn next(&mut self) -> Option<Token> { let start = self.cursor; if self.data.len() == self.cursor { return None; } // awkard - if self.CRLF { - self.CRLF = false; - self.cursor += 1; - return Some(Token::CRLF); - } - if self.LT { - self.LT = false; - self.cursor += 1; - return Some(Token::LT); - } if self.in_tag { while self.data[self.cursor] != b'>' && self.cursor < self.data.len() { self.cursor += 1; @@ -111,32 +99,45 @@ impl<'a> Iterator for EnrichedTextParser<'a> { } } loop { - if self.cursor == self.data.len() { - return Some(Token::Text(&self.data[start..self.cursor])); - } - if self.data[self.cursor] == b'\r' - && self.cursor < self.data.len() + 1 - && self.data[self.cursor + 1] == b'\n' - { - self.CRLF = true; + if self.cursor == self.data.len() || self.cursor > start { + println!("{}", self.cursor); + let mut out = Vec::new(); + let data = &self.data[start..self.cursor]; + let mut crlf_count = 0; + let mut skip = true; + for (idx, window) in data.windows(2).enumerate() { + if skip { + skip = false; + continue; + } + if crlf_count >= 1 { + if crlf_count >= 2 { + for _ in 1..crlf_count - 1 { + out.push(b'\r'); + out.push(b'\n'); + } + } else { + out.push(b' '); + } + crlf_count = 0; + } + if window[0] == b'\r' && window[1] == b'\n' { + crlf_count += 1; + } else if window[0] == b'<' && window[1] == b'<' { + skip = true; + } else { + out.push(window[0]); + } + } + return Some(Token::Text(out)); } if self.data[self.cursor] == b'<' { if self.cursor < self.data.len() - 1 && 
self.data[self.cursor + 1] == b'<' { - self.LT = true; } else if self.data[self.cursor] == b'<' { self.in_tag = true; } - self.cursor += 1; - if self.cursor - 1 > start { - return Some(Token::Text(&self.data[start..self.cursor - 1])); - } else { - return self.next(); - } } self.cursor += 1; - if self.CRLF || self.LT { - return Some(Token::Text(&self.data[start..self.cursor - 1])); - } } } } @@ -150,7 +151,7 @@ mod tests { fn test_simple() { let data = b"Hello world!"; let p = EnrichedTextParser::new(data); - assert_eq!(p.collect::<Vec<Token>>(), vec![Token::Text(data)]); + assert_eq!(p.collect::<Vec<Token>>(), vec![Token::Text(data.to_vec())]); } #[test] @@ -160,9 +161,9 @@ mod tests { assert_eq!( p.collect::<Vec<Token>>(), vec![ - Token::Text(b"Hello "), + Token::Text(b"Hello ".to_vec()), Token::Start(Tag::Bold), - Token::Text(b"world!"), + Token::Text(b"world!".to_vec()), Token::End(Tag::Bold) ] ); @@ -174,11 +175,11 @@ mod tests { assert_eq!( p.collect::<Vec<Token>>(), vec![ - Token::Text(b"Hello "), + Token::Text(b"Hello ".to_vec()), Token::Start(Tag::Indent), - Token::Text(b"beautiful "), + Token::Text(b"beautiful ".to_vec()), Token::Start(Tag::Bold), - Token::Text(b"world!"), + Token::Text(b"world!".to_vec()), Token::End(Tag::Bold), Token::End(Tag::Indent) ] @@ -191,9 +192,9 @@ mod tests { assert_eq!( p.collect::<Vec<Token>>(), vec![ - Token::Text(b"Stay "), + Token::Text(b"Stay ".to_vec()), Token::Start(Tag::Unrecognized), - Token::Text(b"cool"), + Token::Text(b"cool".to_vec()), ] ) } @@ -203,13 +204,7 @@ mod tests { let p = EnrichedTextParser::new(data); assert_eq!( p.collect::<Vec<Token>>(), - vec![ - Token::Text(b"Hello"), - Token::CRLF, - Token::Text(b"World"), - Token::LT, - Token::Text(b"Universe") - ] + vec![Token::Text(b"Hello World<Universe".to_vec()),] ); } }