From 62c083b5e3a164d596b49132c8c53248aa2daf42 Mon Sep 17 00:00:00 2001 From: Teddy Wing Date: Sun, 14 Mar 2021 17:24:18 +0100 Subject: Strip HTML tags from single-part HTML emails When an HTML body is fed to 'whatlang', it recognises it as English. This is likely due to the English HTML syntax. Remove all HTML tags with a simple regex substitution to get the language recognition working more properly. This doesn't remove CSS, which could also confuse the language recogniser. In a limited test, it seemed to work without having to remove any CSS, so not bothering with that. Still need to get this working for multipart emails. --- Cargo.lock | 33 +++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/main.rs | 11 ++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e17605c..1aab5af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,15 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +[[package]] +name = "aho-corasick" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.0.1" @@ -85,12 +94,19 @@ dependencies = [ "quoted_printable", ] +[[package]] +name = "memchr" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" + [[package]] name = "ottolangy" version = "0.0.1" dependencies = [ "exitcode", "mailparse", + "regex", "thiserror", "whatlang", "xdg", @@ -120,6 +136,23 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b080c5db639b292ac79cbd34be0cfc5d36694768d8341109634d90b86930e2" +[[package]] +name = "regex" +version = "1.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54fd1046a3107eb58f42de31d656fee6853e5d276c455fd943742dce89fc3dd3" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" + [[package]] name = "syn" version = "1.0.63" diff --git a/Cargo.toml b/Cargo.toml index ba30916..d64f53d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" [dependencies] exitcode = "1.1.2" mailparse = "0.13.2" +regex = "1.4.4" thiserror = "1.0.24" whatlang = "0.11.1" xdg = "2.2.0" diff --git a/src/main.rs b/src/main.rs index fbb6976..656403a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use exitcode; use mailparse; +use regex::Regex; use thiserror::Error; use whatlang::{self, Lang}; use xdg; @@ -109,6 +110,7 @@ fn run() -> Result<(), OttolangyError> { let lang_info = whatlang::detect(&body) .ok_or(OttolangyError::DetectLanguage)?; + println!("lang: {:?}", lang_info); let attribution_config = if lang_info.lang() == Lang::Fra { ATTRIBUTION_FR } else { @@ -131,7 +133,14 @@ fn get_email_body(email: &[u8]) -> Result { println!("ctype: {:?}", email.ctype); if email.subparts.is_empty() { - let body = email.get_body()?; + let mut body = email.get_body()?; + + if email.ctype.mimetype == "text/html" { + let re = Regex::new("<[^>]*>").unwrap(); + body = re.replace_all(&body, "").into_owned(); + } + + println!("body: {:?}", body); return Ok(body); } -- cgit v1.2.3