diff options
| author | Teddy Wing | 2021-03-14 17:24:18 +0100 | 
|---|---|---|
| committer | Teddy Wing | 2021-03-14 17:24:18 +0100 | 
| commit | 62c083b5e3a164d596b49132c8c53248aa2daf42 (patch) | |
| tree | 7d6ef1866c54e09a9a5b9716f779934cb6bd29b4 /src | |
| parent | 7d46438c015e400ca6c035f5d99da040e6765740 (diff) | |
| download | mutt-ottolangy-62c083b5e3a164d596b49132c8c53248aa2daf42.tar.bz2 | |
Strip HTML tags from single-part HTML emails
When an HTML body is fed to 'whatlang', it recognises it as English.
This is likely due to the English HTML syntax. Remove all HTML tags with
a simple regex substitution so that language recognition works more
reliably.
This doesn't remove CSS, which could also confuse the language
recogniser. In a limited test, detection seemed to work without removing
any CSS, so CSS is left in place for now.
Still need to get this working for multipart emails.
Diffstat (limited to 'src')
| -rw-r--r-- | src/main.rs | 11 | 
1 files changed, 10 insertions, 1 deletions
| diff --git a/src/main.rs b/src/main.rs index fbb6976..656403a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@  use exitcode;  use mailparse; +use regex::Regex;  use thiserror::Error;  use whatlang::{self, Lang};  use xdg; @@ -109,6 +110,7 @@ fn run() -> Result<(), OttolangyError> {      let lang_info = whatlang::detect(&body)          .ok_or(OttolangyError::DetectLanguage)?; +    println!("lang: {:?}", lang_info);      let attribution_config = if lang_info.lang() == Lang::Fra {          ATTRIBUTION_FR      } else { @@ -131,7 +133,14 @@ fn get_email_body(email: &[u8]) -> Result<String, WrapError> {      println!("ctype: {:?}", email.ctype);      if email.subparts.is_empty() { -        let body = email.get_body()?; +        let mut body = email.get_body()?; + +        if email.ctype.mimetype == "text/html" { +            let re = Regex::new("<[^>]*>").unwrap(); +            body = re.replace_all(&body, "").into_owned(); +        } + +        println!("body: {:?}", body);          return Ok(body);      } | 
