gmi2html

library and cli tool to convert gemtext to HTML
git clone git://git.alexwennerberg.com/gmi2html
Log | Files | Refs | README

commit 3b5d061eeee13cca8846af40f76667075c0bd2b4
Author: alex wennerberg <alex@alexwennerberg.com>
Date:   Sun, 22 May 2022 16:46:07 -0700

Initial commit

I deleted this a while ago, reviving the library

Diffstat:
A .gitignore | 1 +
A Cargo.lock | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A Cargo.toml | 12 ++++++++++++
A main.rs | 13 +++++++++++++
A src/lib.rs | 323 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 434 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/Cargo.lock b/Cargo.lock @@ -0,0 +1,85 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "gmi2html" +version = "0.1.6" +dependencies = [ + "url", +] + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "unicode-bidi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] diff --git a/Cargo.toml b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +authors = ["alex wennerberg <alex@alexwennerberg.com>"] +description = "Convert text/gemini into HTML" +edition = "2018" +license = "MIT" +name = "gmi2html" +readme = "README.md" +repository = "https://git.alexwennerberg.com/gmi2html" +version = "0.1.6" + +[dependencies] +url = "*" diff --git a/main.rs b/main.rs @@ -0,0 +1,13 @@ +use gmi2html::GeminiConverter; +use std::io::{self, Read}; + +fn main() { + let mut buffer = String::new(); + // Basic CLI with hard-coded defaults TODO add cli + io::stdin().read_to_string(&mut buffer).unwrap(); + let res = GeminiConverter::new(&buffer) + .proxy_url("https://portal.mozz.us/gemini/") + .inline_images(true) + .to_html(); + println!("{}", res); +} diff --git a/src/lib.rs b/src/lib.rs @@ -0,0 +1,323 @@ +//! An implementation of gmi -> HTML conversion, based on +//! the [text/gemini](https://gemini.circumlunar.space/docs/specification.html) spec v0.14.2 +//! +//! Example usage: +//! ``` +//! use gmi2html::GeminiConverter; +//! +//!let res = GeminiConverter::new(r#" +//! ## Hello, Gemini +//! Lorem Ipseum +//! => gemini://gemini.circumlunar.space +//! "#) +//! .proxy_url("https://portal.mozz.us/gemini/") +//! .inline_images(true) +//! .to_html(); +//! 
``` + +use std::collections::HashSet; +use url::{ParseError, Url}; + +static ALLOWED_SCHEMES: &[&str] = &["https", "http", "gemini", "gopher", "mailto"]; + +// All 4 characters for efficiency +static IMAGE_EXTENSIONS: &[&str] = &[".jpg", "jpeg", ".png", ".gif", ".ico", ".svg", "webp"]; + +pub struct GeminiConverter<'a> { + proxy_url: Option<Url>, + allowed_schemes: HashSet<String>, + // TODO allow disallowed configuration + input_text: &'a str, + inline_images: bool, +} + +impl<'a> GeminiConverter<'a> { + /// Initialize the builder with default configuration values. + pub fn new(gmi_text: &'a str) -> Self { + Self { + proxy_url: None, + allowed_schemes: ALLOWED_SCHEMES.iter().map(|a| a.to_string()).collect(), // inefficient + input_text: gmi_text, + inline_images: false, + } + } + + /// Replace `gemini://` in URLS with this prefix for proxying, i.e. over HTTP. Requires trailing slash. + pub fn proxy_url(&mut self, proxy_url: &'a str) -> &mut Self { + self.proxy_url = Some(Url::parse(proxy_url).unwrap()); + self + } + + /// Render relative-path images in-line. Default false. Beware that this can expose you + /// to security issues if you're not careful (e.g. malicious SVG) + pub fn inline_images(&mut self, option: bool) -> &mut Self { + self.inline_images = option; + self + } + + /// Applied before proxy_url. + pub fn allowed_schemes(&mut self, allowed: &'a [&'a str]) -> &mut Self { + // Applied before proxy_url + self.allowed_schemes = allowed.iter().map(|a| a.to_string()).collect(); + self + } + + /// Convert Gemini text to HTML. 
+ pub fn to_html(&self) -> String { + // This function sometimes priorities performance over readability + let mut output = String::new(); + let mut is_pre = false; + let mut is_list = false; + for line in self.input_text.lines() { + // See 5.4.3 "Preformatting toggle lines" + if line.starts_with("```") { + is_pre = !is_pre; + if is_pre { + if line.len() > 3 { + // This is marginally faster than using format!, albeit a bit uglier + output.push_str("<pre alt=\""); + xml_safe(&mut output, &line[3..]); + output.push_str("\">\n"); + } else { + output.push_str("<pre>\n"); + } + } else { + output.push_str("</pre>\n") + } + continue; + } + if is_pre { + xml_safe(&mut output, line); + output.push('\n'); + continue; + } + // See 5.5.2 "Unordered list items" + if line.starts_with("* ") { + if !is_list { + output.push_str("<ul>\n"); + is_list = true; + } + output.push_str("<li>"); + xml_safe(&mut output, &line[2..].trim()); + output.push_str("</li>\n"); + continue; + } else { + if is_list { + output.push_str("</ul>\n"); + } + is_list = false; + } + // 5.5.1 heading lines + if line.starts_with("#") { + let mut count = 0; + for ch in line.chars() { + if ch == '#' { + count += 1; + // Limit to 3 headers. 
+ if count == 3 { + break; + } + } + } + // String allocation for readability + output.push_str(&format!("<h{}>", count)); + xml_safe(&mut output, &line[count..].trim()); + output.push_str(&format!("</h{}>\n", count)); + // 5.5.3 Quote lines + } else if line.starts_with(">") { + output.push_str("<q>"); + xml_safe(&mut output, &line[1..]); + output.push_str("</q><br>\n"); + } else if line.starts_with("=>") { + let mut i = line[2..].split_whitespace(); + let first: &str = i.next().unwrap_or(""); + // inefficient + let second: String = i.collect::<Vec<&str>>().join(" "); + // This is much slower than surrounding code + // TODO consider blacklist + let parsed = Url::parse(first); + let mut is_image = false; + if parsed == Err(ParseError::RelativeUrlWithoutBase) { + let extension: &str = &first[first.len() - 4..first.len()].to_ascii_lowercase(); + if self.inline_images && IMAGE_EXTENSIONS.contains(&extension) { + output.push_str("<img src=\""); + is_image = true; + } else { + output.push_str("<a href=\""); + } + let relative_url = String::new(); + xml_safe(&mut output, first); + output.push_str(&relative_url); + } else { + output.push_str("<a href=\""); + } + if let Ok(p) = parsed { + if self.allowed_schemes.contains(p.scheme()) { + if p.scheme() == "gemini" { + // TODO FIX + if let Some(s) = &self.proxy_url { + // Never fail, just use blank string if cant parse + let join = |a: &Url, b: Url| -> Result<String, Box<dyn std::error::Error>> { + Ok(a.join(b.host_str().ok_or("err")?)?.join(b.path())?.as_str().to_string()) + }; + let proxied = join(s, p).unwrap_or("".to_string()); // Dont fail + output.push_str(&proxied); + } else { + output.push_str(p.as_str()); + } + } else { + output.push_str(p.as_str()); + } + } + } + let link_text = match second.as_str() { + "" => first, + t => t, + }; + if !is_image { + output.push_str("\">"); + xml_safe(&mut output, link_text); + output.push_str("</a>"); + } else { + output.push_str("\" alt=\""); + xml_safe(&mut output, link_text); + 
output.push_str("\">"); + } + output.push_str("<br>\n"); + } else { + xml_safe(&mut output, line); + output.push_str("<br>\n"); + } + } + // Check outstanding tags that need to be closed + if is_list { + output.push_str("</ul>"); + } + if is_pre { + output.push_str("</pre>") + } + return output; + } +} + +pub fn xml_safe(dest: &mut String, text: &str) { + for c in text.chars() { + match c { + '&' => dest.push_str("&amp;"), + '<' => dest.push_str("&lt;"), + '>' => dest.push_str("&gt;"), + '"' => dest.push_str("&quot;"), + '\'' => dest.push_str("&#39;"), + _ => dest.push(c), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_basic() { + assert_eq!( + GeminiConverter::new("hello world").to_html(), + "hello world<br>\n" + ) + } + + #[test] + fn test_unsafe_html() { + assert_eq!( + GeminiConverter::new("<b>hacked</b>").to_html(), + "&lt;b&gt;hacked&lt;/b&gt;<br>\n" + ); + // TODO add more tests + } + + #[test] + fn test_whitespace() { + assert_eq!( + GeminiConverter::new("\n\n\n").to_html(), + "<br>\n<br>\n<br>\n" + ) + } + + #[test] + fn test_list() { + assert_eq!( + GeminiConverter::new("hi\n* cool\n* vibes\nok").to_html(), + "hi<br>\n<ul>\n<li>cool</li>\n<li>vibes</li>\n</ul>\nok<br>\n" + ) + } + + #[test] + fn test_quote() { + assert_eq!( + GeminiConverter::new("> stay cool\n-coolguy").to_html(), + "<q> stay cool</q><br>\n-coolguy<br>\n" + ) + } + #[test] + fn test_headers() { + assert_eq!( + GeminiConverter::new("#header").to_html(), + "<h1>header</h1>\n" + ); + assert_eq!( + GeminiConverter::new("##header").to_html(), + "<h2>header</h2>\n" + ); + assert_eq!( + GeminiConverter::new("### header").to_html(), + "<h3>header</h3>\n" + ); + assert_eq!( + GeminiConverter::new("####header").to_html(), + "<h3>#header</h3>\n" + ); + } + + #[test] + fn test_pre() { + assert_eq!( + GeminiConverter::new("```\nhello world\n```").to_html(), + "<pre>\nhello world\n</pre>\n" + ); + } + + #[test] + fn test_pre_alt() { + assert_eq!( + 
GeminiConverter::new("```alt\"\nhello world\n```").to_html(), + "<pre alt=\"alt&quot;\">\nhello world\n</pre>\n" + ); + } + + #[test] + fn test_hyperlink() { + assert_eq!( + // TODO resolve trailing slash issue + GeminiConverter::new("=> https://google.com").to_html(), + "<a href=\"https://google.com/\">https://google.com</a><br>\n" + ) + } + + #[test] + fn test_replace_image() { + assert_eq!( + GeminiConverter::new("=> something.jpg cool pic") + .inline_images(true) + .to_html(), + "<img src=\"something.jpg\" alt=\"cool pic\"><br>\n" + ) + } + + #[test] + fn test_proxy() { + assert_eq!( + GeminiConverter::new("=> gemini://alexwrites.xyz") + .proxy_url("https://flounder.online/proxy/") + .to_html(), + "<a href=\"https://flounder.online/proxy/alexwrites.xyz\">gemini://alexwrites.xyz</a><br>\n" + ) + } +}