commit 63fdfbdbbbc6d80c3fa9dbd8558b6ce57e365322
Author: alex wennerberg <alex@alexwennerberg.com>
Date: Sat, 8 Jan 2022 19:34:40 -0800
Initial commit
Diffstat:
5 files changed, 233 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "enriched-text"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,4 @@
+[package]
+name = "enriched-text"
+version = "0.1.0"
+edition = "2018"
diff --git a/README b/README
@@ -0,0 +1,6 @@
+text/enriched parser
+===================
+
+Parser for text/enriched, as defined in https://datatracker.ietf.org/doc/html/rfc1523
+
+In "alpha"
diff --git a/src/lib.rs b/src/lib.rs
@@ -0,0 +1,215 @@
+//! A parser for the `text/enriched` format, as described in RFC 1523:
+//! <https://datatracker.ietf.org/doc/html/rfc1523>.
+
+/// A single lexical element produced while scanning `text/enriched` input.
+#[derive(Debug, Eq, PartialEq)]
+pub enum Token<'a> {
+ /// A run of literal content, borrowed from the input. Kept as raw bytes
+ /// rather than `str` so the parser stays encoding-agnostic.
+ Text(&'a [u8]),
+ /// A `\r\n` line break found in the input.
+ CRLF,
+ /// A literal `<`, which the input spells as `<<`.
+ LT,
+ /// A closing formatting command, i.e. one whose name starts with `/`.
+ End(Tag),
+ /// An opening formatting command.
+ Start(Tag),
+}
+
+/// The formatting command named by a `Token::Start` or `Token::End`.
+#[derive(Debug, Eq, PartialEq)]
+pub enum Tag {
+ // Font-alteration commands
+ Bold,
+ Italic,
+ Fixed,
+ Smaller,
+ Bigger,
+ Underline,
+ // Justification commands
+ // Note to library users: when these are nested, the inner tag takes precedence
+ Center,
+ FlushLeft,
+ FlushRight,
+ // Indentation commands
+ Indent,
+ IndentRight,
+ // Miscellaneous commands
+ Excerpt,
+ Verbatim,
+ Nofill,
+ Param,
+ // Fallback for any command name the parser does not recognize
+ Unrecognized,
+}
+
+/// Streaming tokenizer for `text/enriched` (RFC 1523) content.
+///
+/// Wraps a byte slice and, as an [`Iterator`], yields the [`Token`]s found in
+/// it from front to back. Tokens borrow from the input; nothing is copied.
+///
+/// The input is never rejected and iteration never panics: malformed markup
+/// is tokenized on a best-effort basis (the rules are listed on `next`).
+#[derive(Debug, Clone)]
+pub struct EnrichedTextParser<'a> {
+    /// Offset into `data` of the first byte that has not been tokenized yet.
+    /// Invariant: `cursor <= data.len()`.
+    cursor: usize,
+    /// The complete input being tokenized.
+    data: &'a [u8],
+}
+
+impl<'a> EnrichedTextParser<'a> {
+    /// Creates a parser positioned at the start of `s`.
+    pub fn new(s: &'a [u8]) -> Self {
+        Self { cursor: 0, data: s }
+    }
+}
+
+/// Maps a command name (angle brackets and leading `/` already removed) to
+/// its [`Tag`].
+///
+/// Matching ignores ASCII case. Anything that is not a known command,
+/// including names containing non-ASCII bytes, becomes [`Tag::Unrecognized`].
+fn tag_from_name(name: &[u8]) -> Tag {
+    match name.to_ascii_lowercase().as_slice() {
+        b"bold" => Tag::Bold,
+        b"italic" => Tag::Italic,
+        b"fixed" => Tag::Fixed,
+        b"smaller" => Tag::Smaller,
+        b"bigger" => Tag::Bigger,
+        b"underline" => Tag::Underline,
+        b"center" => Tag::Center,
+        b"flushleft" => Tag::FlushLeft,
+        b"flushright" => Tag::FlushRight,
+        b"indent" => Tag::Indent,
+        b"indentright" => Tag::IndentRight,
+        b"excerpt" => Tag::Excerpt,
+        b"verbatim" => Tag::Verbatim,
+        b"nofill" => Tag::Nofill,
+        b"param" => Tag::Param,
+        _ => Tag::Unrecognized,
+    }
+}
+
+impl<'a> Iterator for EnrichedTextParser<'a> {
+    type Item = Token<'a>;
+
+    /// Returns the next token, or `None` once the input is exhausted.
+    ///
+    /// Rules, tried in this order at the current position:
+    /// * `\r\n` yields [`Token::CRLF`].
+    /// * `<<` yields [`Token::LT`] (the only escape in the format).
+    /// * `<name>` yields [`Token::Start`], `</name>` yields [`Token::End`].
+    ///   A command missing its closing `>` extends to the end of the input.
+    ///   A lone `<` as the very last byte is dropped.
+    /// * Anything else is a non-empty [`Token::Text`] running up to the next
+    ///   `\r\n` or `<`. A `\r` that is not followed by `\n` is ordinary text.
+    fn next(&mut self) -> Option<Token<'a>> {
+        // Copy the `&'a [u8]` out so the tokens borrow from the input, not
+        // from `self`.
+        let input: &'a [u8] = self.data;
+        let rest = &input[self.cursor..];
+        let first = *rest.first()?;
+
+        if rest.starts_with(b"\r\n") {
+            self.cursor += 2;
+            return Some(Token::CRLF);
+        }
+        if rest.starts_with(b"<<") {
+            self.cursor += 2;
+            return Some(Token::LT);
+        }
+        if first == b'<' {
+            let after_lt = &rest[1..];
+            if after_lt.is_empty() {
+                // A dangling `<` at the very end carries no command name.
+                self.cursor = input.len();
+                return None;
+            }
+            // `consumed` counts the `<`, the command text and, when present,
+            // the closing `>`.
+            let (command, consumed) = match after_lt.iter().position(|&b| b == b'>') {
+                Some(end) => (&after_lt[..end], end + 2),
+                None => (after_lt, rest.len()),
+            };
+            self.cursor += consumed;
+
+            let is_end = command.first() == Some(&b'/');
+            // Every leading `/` is ignored when looking up the command name.
+            let name_start = command
+                .iter()
+                .position(|&b| b != b'/')
+                .unwrap_or(command.len());
+            let tag = tag_from_name(&command[name_start..]);
+            return Some(if is_end {
+                Token::End(tag)
+            } else {
+                Token::Start(tag)
+            });
+        }
+
+        // Plain text: `rest[0]` is known not to start a delimiter, so the
+        // search for the next one begins at index 1 and the token is never
+        // empty.
+        let len = (1..rest.len())
+            .find(|&i| rest[i] == b'<' || rest[i..].starts_with(b"\r\n"))
+            .unwrap_or(rest.len());
+        self.cursor += len;
+        Some(Token::Text(&rest[..len]))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    // Bring the parser and the token types of the parent module into scope.
+    use super::*;
+
+    /// Runs the parser over `input` and gathers every token it produces.
+    fn tokenize(input: &[u8]) -> Vec<Token<'_>> {
+        EnrichedTextParser::new(input).collect()
+    }
+
+    /// Input without any markup comes back as one text token.
+    #[test]
+    fn test_simple() {
+        let input = b"Hello world!";
+        assert_eq!(tokenize(input), vec![Token::Text(input)]);
+    }
+
+    /// A command pair splits the text around it; names match regardless of case.
+    #[test]
+    fn test_tag() {
+        let expected = vec![
+            Token::Text(b"Hello "),
+            Token::Start(Tag::Bold),
+            Token::Text(b"world!"),
+            Token::End(Tag::Bold),
+        ];
+        assert_eq!(tokenize(b"Hello <Bold>world!</Bold>"), expected);
+    }
+
+    /// Nested commands are reported in the order they appear in the input.
+    #[test]
+    fn test_nested_tag() {
+        let expected = vec![
+            Token::Text(b"Hello "),
+            Token::Start(Tag::Indent),
+            Token::Text(b"beautiful "),
+            Token::Start(Tag::Bold),
+            Token::Text(b"world!"),
+            Token::End(Tag::Bold),
+            Token::End(Tag::Indent),
+        ];
+        assert_eq!(
+            tokenize(b"Hello <indent>beautiful <bold>world!</bold></indent>"),
+            expected
+        );
+    }
+
+    /// An unknown command that is never closed still yields its start token.
+    #[test]
+    fn test_incomplete_tag() {
+        let expected = vec![
+            Token::Text(b"Stay "),
+            Token::Start(Tag::Unrecognized),
+            Token::Text(b"cool"),
+        ];
+        assert_eq!(tokenize(b"Stay <cool>cool"), expected);
+    }
+
+    /// `\r\n` and `<<` are reported as their own tokens between text runs.
+    #[test]
+    fn test_escapes() {
+        let expected = vec![
+            Token::Text(b"Hello"),
+            Token::CRLF,
+            Token::Text(b"World"),
+            Token::LT,
+            Token::Text(b"Universe"),
+        ];
+        assert_eq!(tokenize(b"Hello\r\nWorld<<Universe"), expected);
+    }
+}