enriched-text

Rust parser for text/enriched as defined by rfc1523
git clone git://git.alexwennerberg.com/enriched-text
Log | Files | Refs | README | LICENSE

commit 63fdfbdbbbc6d80c3fa9dbd8558b6ce57e365322
Author: alex wennerberg <alex@alexwennerberg.com>
Date:   Sat,  8 Jan 2022 19:34:40 -0800

Initial commit

Diffstat:
A .gitignore | 1+
A Cargo.lock | 7+++++++
A Cargo.toml | 4++++
A README | 6++++++
A src/lib.rs | 215+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 233 insertions(+), 0 deletions(-)

// Files added alongside src/lib.rs in this commit:
//
// .gitignore
//     /target
//
// Cargo.lock
//     # This file is automatically @generated by Cargo.
//     # It is not intended for manual editing.
//     version = 3
//
//     [[package]]
//     name = "enriched-text"
//     version = "0.1.0"
//
// Cargo.toml
//     [package]
//     name = "enriched-text"
//     version = "0.1.0"
//     edition = "2018"
//
// README
//     text/enriched parser
//     ===================
//
//     Parser for text/enriched, as defined in https://datatracker.ietf.org/doc/html/rfc1523
//
//     In "alpha"
//
// src/lib.rs follows.

//! A parser for the text/enriched format, as described in
//! https://datatracker.ietf.org/doc/html/rfc1523

/// One lexical element produced by [`EnrichedTextParser`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Token<'a> {
    /// A run of literal bytes, borrowed from the input.
    ///
    /// Bytes (rather than `str`) are returned in order to be encoding-agnostic.
    /// A text token is never empty.
    Text(&'a [u8]),
    /// A `\r\n` pair.
    CRLF,
    /// The escape sequence `<<`, i.e. a literal `<`.
    LT,
    /// A closing command such as `</bold>`.
    End(Tag),
    /// An opening command such as `<bold>`.
    Start(Tag),
}

/// The formatting command named inside `<...>` or `</...>`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Tag {
    // Font-alteration commands
    Bold,
    Italic,
    Fixed,
    Smaller,
    Bigger,
    Underline,
    // Justification commands
    // Note to library users: inner tag takes precedence
    Center,
    FlushLeft,
    FlushRight,
    // Indentation commands
    Indent,
    IndentRight,
    // Miscellaneous commands
    Excerpt,
    Verbatim,
    Nofill,
    Param,
    // Unrecognized
    Unrecognized,
}

/// Command names (lower case) and the tag each one maps to.
const TAG_NAMES: &[(&str, Tag)] = &[
    ("bold", Tag::Bold),
    ("italic", Tag::Italic),
    ("fixed", Tag::Fixed),
    ("smaller", Tag::Smaller),
    ("bigger", Tag::Bigger),
    ("underline", Tag::Underline),
    ("center", Tag::Center),
    ("flushleft", Tag::FlushLeft),
    ("flushright", Tag::FlushRight),
    ("indent", Tag::Indent),
    ("indentright", Tag::IndentRight),
    ("excerpt", Tag::Excerpt),
    ("verbatim", Tag::Verbatim),
    ("nofill", Tag::Nofill),
    ("param", Tag::Param),
];

impl Tag {
    /// Maps a command name (without any leading `/`) to its tag.
    ///
    /// The comparison ignores ASCII case. Anything that is not one of the
    /// known names, including non-UTF-8 input, yields [`Tag::Unrecognized`].
    fn from_name(name: &[u8]) -> Tag {
        TAG_NAMES
            .iter()
            .find(|entry| name.eq_ignore_ascii_case(entry.0.as_bytes()))
            .map(|entry| entry.1)
            .unwrap_or(Tag::Unrecognized)
    }
}

/// Iterator that splits a text/enriched byte buffer into [`Token`]s.
///
/// Only `<` is treated as an escape character: `<<` is a literal `<`, and any
/// other `<` opens a command that runs up to the next `>`.
pub struct EnrichedTextParser<'a> {
    /// Index of the next unread byte of `data`; never exceeds `data.len()`.
    cursor: usize,
    /// The complete input being tokenized.
    data: &'a [u8],
}

impl<'a> EnrichedTextParser<'a> {
    /// Creates a parser positioned at the start of `s`.
    pub fn new(s: &'a [u8]) -> Self {
        Self { cursor: 0, data: s }
    }

    /// Consumes a command whose opening `<` is at `self.cursor`.
    ///
    /// The command name runs up to the next `>`. If the input ends first, the
    /// name runs to the end of the input instead of panicking. A lone `<` as
    /// the very last byte yields no token at all.
    fn read_tag(&mut self) -> Option<Token<'a>> {
        let data = self.data;
        let body = &data[self.cursor + 1..];
        if body.is_empty() {
            self.cursor = data.len();
            return None;
        }

        // `consumed` counts the name plus the closing '>' when there is one,
        // so the cursor can never move past the end of the input.
        let (name, consumed) = match body.iter().position(|&b| b == b'>') {
            Some(end) => (&body[..end], end + 1),
            None => (body, body.len()),
        };
        self.cursor += 1 + consumed;

        let is_end = name.first() == Some(&b'/');
        let name_start = name.iter().position(|&b| b != b'/').unwrap_or(name.len());
        let tag = Tag::from_name(&name[name_start..]);

        Some(if is_end {
            Token::End(tag)
        } else {
            Token::Start(tag)
        })
    }
}

impl<'a> Iterator for EnrichedTextParser<'a> {
    type Item = Token<'a>;

    /// Returns the next token, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Token<'a>> {
        let data = self.data;
        let rest = &data[self.cursor..];
        if rest.is_empty() {
            return None;
        }

        if rest.starts_with(b"\r\n") {
            self.cursor += 2;
            return Some(Token::CRLF);
        }
        if rest.starts_with(b"<<") {
            self.cursor += 2;
            return Some(Token::LT);
        }
        if rest[0] == b'<' {
            return self.read_tag();
        }

        // Plain text runs up to the next '<' or "\r\n". A '\r' that is not
        // followed by '\n' is ordinary text. The first byte is neither
        // delimiter here, so the run is at least one byte long.
        let len = (0..rest.len())
            .find(|&i| rest[i] == b'<' || rest[i..].starts_with(b"\r\n"))
            .unwrap_or(rest.len());
        self.cursor += len;
        Some(Token::Text(&rest[..len]))
    }
}

#[cfg(test)]
mod tests {
    // Note this useful idiom: importing names from outer (for mod tests) scope.
    use super::*;

    #[test]
    fn test_simple() {
        let data = b"Hello world!";
        let p = EnrichedTextParser::new(data);
        assert_eq!(p.collect::<Vec<Token>>(), vec![Token::Text(data)]);
    }

    #[test]
    fn test_tag() {
        let data = b"Hello <Bold>world!</Bold>";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![
                Token::Text(b"Hello "),
                Token::Start(Tag::Bold),
                Token::Text(b"world!"),
                Token::End(Tag::Bold)
            ]
        );
    }

    #[test]
    fn test_nested_tag() {
        let data = b"Hello <indent>beautiful <bold>world!</bold></indent>";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![
                Token::Text(b"Hello "),
                Token::Start(Tag::Indent),
                Token::Text(b"beautiful "),
                Token::Start(Tag::Bold),
                Token::Text(b"world!"),
                Token::End(Tag::Bold),
                Token::End(Tag::Indent)
            ]
        );
    }

    #[test]
    fn test_incomplete_tag() {
        let data = b"Stay <cool>cool";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![
                Token::Text(b"Stay "),
                Token::Start(Tag::Unrecognized),
                Token::Text(b"cool"),
            ]
        )
    }

    #[test]
    fn test_escapes() {
        let data = b"Hello\r\nWorld<<Universe";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![
                Token::Text(b"Hello"),
                Token::CRLF,
                Token::Text(b"World"),
                Token::LT,
                Token::Text(b"Universe")
            ]
        );
    }

    #[test]
    fn test_verbatim() {
        let data = b"<verbatim>x</verbatim>";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![
                Token::Start(Tag::Verbatim),
                Token::Text(b"x"),
                Token::End(Tag::Verbatim)
            ]
        );
    }

    #[test]
    fn test_unterminated_tag() {
        let data = b"Hello <bold";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![Token::Text(b"Hello "), Token::Start(Tag::Bold)]
        );
    }

    #[test]
    fn test_trailing_carriage_return() {
        let data = b"Hello\r";
        let p = EnrichedTextParser::new(data);
        assert_eq!(p.collect::<Vec<Token>>(), vec![Token::Text(b"Hello\r")]);
    }

    #[test]
    fn test_leading_crlf() {
        let data = b"\r\n\r\nHi";
        let p = EnrichedTextParser::new(data);
        assert_eq!(
            p.collect::<Vec<Token>>(),
            vec![Token::CRLF, Token::CRLF, Token::Text(b"Hi")]
        );
    }
}