enriched-text

Rust parser for text/enriched as defined by RFC 1896 (which obsoletes RFC 1563 and RFC 1523)
git clone git://git.alexwennerberg.com/enriched-text
Log | Files | Refs | README | LICENSE

lib.rs (8711B)


      1 //! A parser for the text/enriched format, as described in
      2 //! https://datatracker.ietf.org/doc/html/rfc1896
      3 
      4 #[derive(Debug, Eq, PartialEq)]
      5 pub enum Token {
      6     // returns bytes in order to be encoding-agnostic
      7     Text(Vec<u8>),
      8     End(Tag),
      9     Start(Tag),
     10 }
     11 
     12 #[derive(Debug, Eq, PartialEq)]
     13 pub enum Tag {
     14     // Font-alteration commands
     15     Bold,
     16     Italic,
     17     Fixed,
     18     Smaller,
     19     Bigger,
     20     Underline,
     21     // Justification commands
     22     // Note to library users: inner tag takes precedence
     23     Center,
     24     FlushLeft,
     25     FlushRight,
     26     // Indendation commands
     27     Indent,
     28     IndentRight,
     29     // Miscellaneous commands
     30     Excerpt,
     31     Verbatim,
     32     Nofill,
     33     Param,
     34     // Unrecognized
     35     Unrecognized(String),
     36 }
     37 
     38 pub struct EnrichedTextParser<'a> {
     39     cursor: usize,
     40     data: &'a [u8],
     41     next: Option<Token>,
     42 }
     43 
     44 // escape only <
     45 impl<'a> EnrichedTextParser<'a> {
     46     pub fn new(s: &'a [u8]) -> Self {
     47         return Self {
     48             cursor: 0,
     49             data: s,
     50             next: None,
     51         };
     52     }
     53 }
     54 
     55 impl<'a> EnrichedTextParser<'a> {
     56     /// Function to try and parse a formatting command at the current cursor
     57     /// position. Iff a command is found, returns `Some` with `Token::Start` or
     58     /// `Token::End` respectively. If no command is found, returns `None`.
     59     ///
     60     /// The cursor is only advanced if a command was found.
     61     /// ### Panics
     62     /// Panics if the cursor is at EOF.
     63     fn try_parse_command(&mut self) -> Option<Token> {
     64         let start = self.cursor;
     65         if self.data[self.cursor] == b'<' {
     66             self.cursor += 1;
     67             // maybe a tag
     68             let is_end = if self.data[self.cursor] == b'/' {
     69                 self.cursor += 1;
     70                 true
     71             } else {
     72                 false
     73             };
     74 
     75             /*
     76             each formatting command may be no longer than 60 chars
     77 
     78             subtract 1 from the length so there will be at least 1 more byte
     79             at the end of the data for the '>'
     80             */
     81             let end = usize::min(self.data.len() - 1, self.cursor + 60);
     82 
     83             let command_len = self.data[self.cursor..end]
     84                 .iter()
     85                 .take_while(|byte| matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9'))
     86                 .count();
     87 
     88             if command_len > 0 && self.data[self.cursor + command_len] == b'>' {
     89                 // indeed a formatting command
     90                 let command =
     91                     std::str::from_utf8(&self.data[self.cursor..self.cursor + command_len])
     92                         // its safe to unwrap because we know the command consists of
     93                         // only ASCII alphanumerics from parsing it above
     94                         .unwrap()
     95                         // commands are case insensitive
     96                         .to_ascii_lowercase();
     97 
     98                 let command = match command.as_str() {
     99                     "bold" => Tag::Bold,
    100                     "italic" => Tag::Italic,
    101                     "fixed" => Tag::Fixed,
    102                     "smaller" => Tag::Smaller,
    103                     "bigger" => Tag::Bigger,
    104                     "underline" => Tag::Underline,
    105                     "center" => Tag::Center,
    106                     "flushleft" => Tag::FlushLeft,
    107                     "flushright" => Tag::FlushRight,
    108                     "indent" => Tag::Indent,
    109                     "indentright" => Tag::IndentRight,
    110                     "excerpt" => Tag::Excerpt,
    111                     "vertbatim" => Tag::Verbatim,
    112                     // TODO nofill should skip the CRLF processing
    113                     "nofill" => Tag::Nofill,
    114                     // TODO param should ??? skip entirely?
    115                     "param" => Tag::Param,
    116                     _ => Tag::Unrecognized(command),
    117                 };
    118 
    119                 // also count '>'
    120                 self.cursor += command_len + 1;
    121 
    122                 return if is_end {
    123                     Some(Token::End(command))
    124                 } else {
    125                     Some(Token::Start(command))
    126                 };
    127             }
    128         }
    129 
    130         // reset the cursor, maybe we fell through above
    131         self.cursor = start;
    132         None
    133     }
    134 
    135     /// Processes text to handle CRLFs specially.
    136     fn process_text(text: &'a [u8]) -> Vec<u8> {
    137         let mut out = Vec::new();
    138         let mut lf_count = 0;
    139         let mut i = 0;
    140 
    141         while i < text.len() {
    142             if text[i] == b'\r' && text.get(i + 1) == Some(&b'\n') {
    143                 lf_count += 1;
    144                 i += 2;
    145                 continue;
    146             } else if text[i] == b'\n' {
    147                 // also handle text with LF line endings gracefully
    148                 lf_count += 1;
    149                 i += 1;
    150                 continue;
    151             }
    152             // replace the line feeds appropriately
    153             if lf_count > 1 {
    154                 for _ in 0..lf_count - 1 {
    155                     out.push(b'\r');
    156                     out.push(b'\n');
    157                 }
    158                 lf_count = 0;
    159             } else if lf_count == 1 {
    160                 out.push(b' ');
    161                 lf_count = 0;
    162             }
    163 
    164             if text[i] == b'<' && text.get(i + 1) == Some(&b'<') {
    165                 // skip escaped '<'
    166                 i += 1;
    167             }
    168             out.push(text[i]);
    169             i += 1;
    170         }
    171 
    172         out
    173     }
    174 }
    175 
    176 impl<'a> Iterator for EnrichedTextParser<'a> {
    177     type Item = Token;
    178 
    179     fn next(&mut self) -> Option<Token> {
    180         // return any command tokens we already parsed
    181         if let Some(token) = self.next.take() {
    182             return Some(token);
    183         }
    184 
    185         let start = self.cursor;
    186         while self.cursor < self.data.len() {
    187             let maybe_end = self.cursor;
    188             if let Some(command) = self.try_parse_command() {
    189                 // A command starts here, but first we have to return the text
    190                 // if there is any.
    191                 return Some(if start != maybe_end {
    192                     // there is some text, store the parsed command away for
    193                     // the next invocation
    194                     self.next = Some(command);
    195                     Token::Text(Self::process_text(&self.data[start..maybe_end]))
    196                 } else {
    197                     command
    198                 });
    199             } else {
    200                 self.cursor += 1;
    201             }
    202         }
    203 
    204         // must have hit EOF
    205         if self.cursor != start {
    206             Some(Token::Text(Self::process_text(&self.data[start..])))
    207         } else {
    208             None
    209         }
    210     }
    211 }
    212 
    213 #[cfg(test)]
    214 mod tests {
    215     use super::*;
    216 
    217     #[test]
    218     fn test_simple() {
    219         let data = b"Hello world!";
    220         let p = EnrichedTextParser::new(data);
    221         assert_eq!(p.collect::<Vec<Token>>(), vec![Token::Text(data.to_vec())]);
    222     }
    223 
    224     #[test]
    225     fn test_empty() {
    226         let data = b"";
    227         let p = EnrichedTextParser::new(data);
    228         assert_eq!(p.collect::<Vec<Token>>(), vec![]);
    229     }
    230 
    231     #[test]
    232     fn test_tag() {
    233         let data = b"Hello <Bold>world!</Bold>";
    234         let p = EnrichedTextParser::new(data);
    235         assert_eq!(
    236             p.collect::<Vec<Token>>(),
    237             vec![
    238                 Token::Text(b"Hello ".to_vec()),
    239                 Token::Start(Tag::Bold),
    240                 Token::Text(b"world!".to_vec()),
    241                 Token::End(Tag::Bold)
    242             ]
    243         );
    244     }
    245     #[test]
    246     fn test_nested_tag() {
    247         let data = b"Hello <indent>beautiful <bold>world!</bold></indent>";
    248         let p = EnrichedTextParser::new(data);
    249         assert_eq!(
    250             p.collect::<Vec<Token>>(),
    251             vec![
    252                 Token::Text(b"Hello ".to_vec()),
    253                 Token::Start(Tag::Indent),
    254                 Token::Text(b"beautiful ".to_vec()),
    255                 Token::Start(Tag::Bold),
    256                 Token::Text(b"world!".to_vec()),
    257                 Token::End(Tag::Bold),
    258                 Token::End(Tag::Indent)
    259             ]
    260         );
    261     }
    262     #[test]
    263     fn test_incomplete_tag() {
    264         let data = b"Stay</broken><cool>cool";
    265         let p = EnrichedTextParser::new(data);
    266         assert_eq!(
    267             p.collect::<Vec<Token>>(),
    268             vec![
    269                 Token::Text(b"Stay".to_vec()),
    270                 Token::End(Tag::Unrecognized("broken".into())),
    271                 Token::Start(Tag::Unrecognized("cool".into())),
    272                 Token::Text(b"cool".to_vec()),
    273             ]
    274         )
    275     }
    276     #[test]
    277     fn test_escapes() {
    278         let data = b"Hello\r\nWorld<<\r\n\r\nUniverse\n";
    279         let p = EnrichedTextParser::new(data);
    280         assert_eq!(
    281             p.collect::<Vec<Token>>(),
    282             vec![Token::Text(b"Hello World<\r\nUniverse\n".to_vec()),]
    283         );
    284     }
    285 }