From 62c083b5e3a164d596b49132c8c53248aa2daf42 Mon Sep 17 00:00:00 2001 From: Teddy Wing Date: Sun, 14 Mar 2021 17:24:18 +0100 Subject: Strip HTML tags from single-part HTML emails When an HTML body is fed to 'whatlang', it recognises it as English. This is likely due to the English-based HTML syntax (tag and attribute names). Remove all HTML tags with a simple regex substitution so that language recognition works more reliably. This doesn't remove CSS, which could also confuse the language recogniser. In a limited test, it seemed to work without having to remove any CSS, so not bothering with that. Still need to get this working for multipart emails. --- src/main.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index fbb6976..656403a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use exitcode; use mailparse; +use regex::Regex; use thiserror::Error; use whatlang::{self, Lang}; use xdg; @@ -109,6 +110,7 @@ fn run() -> Result<(), OttolangyError> { let lang_info = whatlang::detect(&body) .ok_or(OttolangyError::DetectLanguage)?; + println!("lang: {:?}", lang_info); let attribution_config = if lang_info.lang() == Lang::Fra { ATTRIBUTION_FR } else { @@ -131,7 +133,14 @@ fn get_email_body(email: &[u8]) -> Result { println!("ctype: {:?}", email.ctype); if email.subparts.is_empty() { - let body = email.get_body()?; + let mut body = email.get_body()?; + + if email.ctype.mimetype == "text/html" { + let re = Regex::new("<[^>]*>").unwrap(); + body = re.replace_all(&body, "").into_owned(); + } + + println!("body: {:?}", body); return Ok(body); } -- cgit v1.2.3