aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorTeddy Wing2021-03-14 17:24:18 +0100
committerTeddy Wing2021-03-14 17:24:18 +0100
commit62c083b5e3a164d596b49132c8c53248aa2daf42 (patch)
tree7d6ef1866c54e09a9a5b9716f779934cb6bd29b4 /src
parent7d46438c015e400ca6c035f5d99da040e6765740 (diff)
downloadmutt-ottolangy-62c083b5e3a164d596b49132c8c53248aa2daf42.tar.bz2
Strip HTML tags from single-part HTML emails
When an HTML body is fed to 'whatlang', it recognises it as English. This is likely due to the English-based HTML syntax. Remove all HTML tags with a simple regex substitution so that language recognition works more reliably. This doesn't remove CSS, which could also confuse the language recogniser. In a limited test, detection seemed to work without removing any CSS, so that is left alone for now. This still needs to be made to work for multipart emails.
Diffstat (limited to 'src')
-rw-r--r--src/main.rs11
1 files changed, 10 insertions, 1 deletions
diff --git a/src/main.rs b/src/main.rs
index fbb6976..656403a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,6 +15,7 @@
use exitcode;
use mailparse;
+use regex::Regex;
use thiserror::Error;
use whatlang::{self, Lang};
use xdg;
@@ -109,6 +110,7 @@ fn run() -> Result<(), OttolangyError> {
let lang_info = whatlang::detect(&body)
.ok_or(OttolangyError::DetectLanguage)?;
+ println!("lang: {:?}", lang_info);
let attribution_config = if lang_info.lang() == Lang::Fra {
ATTRIBUTION_FR
} else {
@@ -131,7 +133,14 @@ fn get_email_body(email: &[u8]) -> Result<String, WrapError> {
println!("ctype: {:?}", email.ctype);
if email.subparts.is_empty() {
- let body = email.get_body()?;
+ let mut body = email.get_body()?;
+
+ if email.ctype.mimetype == "text/html" {
+ let re = Regex::new("<[^>]*>").unwrap();
+ body = re.replace_all(&body, "").into_owned();
+ }
+
+ println!("body: {:?}", body);
return Ok(body);
}