diff options
Diffstat (limited to 'parser/src/conversion')
| -rw-r--r-- | parser/src/conversion/block.rs | 235 | ||||
| -rw-r--r-- | parser/src/conversion/inline.rs | 172 | ||||
| -rw-r--r-- | parser/src/conversion/tests.rs | 68 |
3 files changed, 475 insertions, 0 deletions
diff --git a/parser/src/conversion/block.rs b/parser/src/conversion/block.rs
new file mode 100644
index 0000000..ab18c48
--- /dev/null
+++ b/parser/src/conversion/block.rs
@@ -0,0 +1,235 @@
+use failure::{Error,bail};
+use pest::iterators::Pair;
+
+use document_tree::{
+    Element,HasChildren,ExtraAttributes,
+    elements as e,
+    element_categories as c,
+    extra_attributes as a,
+    attribute_types as at
+};
+
+use crate::{
+    pest_rst::Rule,
+    pair_ext_parse::PairExt,
+};
+use super::{whitespace_normalize_name, inline::convert_inlines};
+
+
+/// Which title rule matched (`title_double` or `title_single`), plus the adornment character.
+#[derive(PartialEq)]
+pub(super) enum TitleKind { Double(char), Single(char) }
+
+/// Output of [`convert_ssubel`]: a title with its kind, or any other structural sub-element.
+pub(super) enum TitleOrSsubel {
+    Title(e::Title, TitleKind),
+    Ssubel(c::StructuralSubElement),
+}
+
+
+/// Converts one parsed pair into a title or a structural sub-element.
+///
+/// Returns `Ok(None)` for `Rule::EOI`.
+pub(super) fn convert_ssubel(pair: Pair<Rule>) -> Result<Option<TitleOrSsubel>, Error> {
+    use self::TitleOrSsubel::*;
+    Ok(Some(match pair.as_rule() {
+        Rule::title => { let (t, k) = convert_title(pair)?; Title(t, k) },
+        //TODO: subtitle, decoration, docinfo
+        Rule::EOI => return Ok(None),
+        _ => Ssubel(convert_substructure(pair)?.into()),
+    }))
+}
+
+
+/// Converts a pair into a sub-structure; for now every rule is handled as a body element.
+fn convert_substructure(pair: Pair<Rule>) -> Result<c::SubStructure, Error> {
+    Ok(match pair.as_rule() {
+        // todo: Topic, Sidebar, Transition
+        // no section here, as it’s constructed from titles
+        _ => convert_body_elem(pair)?.into(),
+    })
+}
+
+
+/// Converts a pair into a body element by dispatching on its rule.
+///
+/// Returns an error (instead of panicking) for rules that are not handled yet.
+fn convert_body_elem(pair: Pair<Rule>) -> Result<c::BodyElement, Error> {
+    Ok(match pair.as_rule() {
+        Rule::paragraph => convert_paragraph(pair)?.into(),
+        Rule::target => convert_target(pair)?.into(),
+        Rule::substitution_def => convert_substitution_def(pair)?.into(),
+        Rule::admonition_gen => convert_admonition_gen(pair)?.into(),
+        Rule::image => convert_image::<e::Image>(pair)?.into(),
+        Rule::bullet_list => convert_bullet_list(pair)?.into(),
+        rule => bail!("unhandled rule {:?}", rule),
+    })
+}
+
+
+/// Converts a `title` pair into a title element and its [`TitleKind`].
+///
+/// The title text, lower-cased with newlines removed and spaces replaced by `-`,
+/// is added as a name of the element.
+fn convert_title(pair: Pair<Rule>) -> Result<(e::Title, TitleKind), Error> {
+    let mut title: Option<String> = None;
+    let mut title_inlines: Option<Vec<c::TextOrInlineElement>> = None;
+    let mut adornment_char: Option<char> = None;
+    // title_double or title_single. Extract kind before consuming
+    let inner_pair = pair.into_inner().next().unwrap();
+    let kind = inner_pair.as_rule();
+    for p in inner_pair.into_inner() {
+        match p.as_rule() {
+            Rule::line => {
+                title = Some(p.as_str().to_owned());
+                title_inlines = Some(convert_inlines(p)?);
+            },
+            Rule::adornments => adornment_char = Some(p.as_str().chars().next().expect("Empty adornment?")),
+            rule => unimplemented!("Unexpected rule in title: {:?}", rule),
+        };
+    }
+    // now we encountered one line of text and one of adornments
+    // TODO: emit error if the adornment line is too short (has to match title length)
+    let mut elem = e::Title::with_children(title_inlines.expect("No text in title"));
+    if let Some(title) = title {
+        //TODO: slugify properly
+        let slug = title.to_lowercase().replace("\n", "").replace(" ", "-");
+        elem.names_mut().push(at::NameToken(slug));
+    }
+    let title_kind = match kind {
+        Rule::title_double => TitleKind::Double(adornment_char.unwrap()),
+        Rule::title_single => TitleKind::Single(adornment_char.unwrap()),
+        _ => unreachable!(),
+    };
+    Ok((elem, title_kind))
+}
+
+
+/// Converts a `paragraph` pair into a paragraph holding its converted inline children.
+fn convert_paragraph(pair: Pair<Rule>) -> Result<e::Paragraph, Error> {
+    Ok(e::Paragraph::with_children(convert_inlines(pair)?))
+}
+
+
+/// Converts a `target` pair into a non-anonymous target element.
+///
+/// Target names become both ids and names; a `link_target` is parsed into `refuri`.
+fn convert_target(pair: Pair<Rule>) -> Result<e::Target, Error> {
+    let mut elem: e::Target = Default::default();
+    elem.extra_mut().anonymous = false;
+    for p in pair.into_inner() {
+        match p.as_rule() {
+            Rule::target_name_uq | Rule::target_name_qu => {
+                elem.ids_mut().push(p.as_str().into());
+                elem.names_mut().push(p.as_str().into());
+            },
+            // TODO: also handle non-urls
+            Rule::link_target => elem.extra_mut().refuri = Some(p.parse()?),
+            rule => panic!("Unexpected rule in target: {:?}", rule),
+        }
+    }
+    Ok(elem)
+}
+
+/// Converts a `substitution_def` pair: a whitespace-normalized name plus `replace` or `image` content.
+fn convert_substitution_def(pair: Pair<Rule>) -> Result<e::SubstitutionDefinition, Error> {
+    let mut pairs = pair.into_inner();
+    let name = whitespace_normalize_name(pairs.next().unwrap().as_str()); // Rule::substitution_name
+    let inner_pair = pairs.next().unwrap();
+    let inner: Vec<c::TextOrInlineElement> = match inner_pair.as_rule() {
+        Rule::replace => convert_replace(inner_pair)?,
+        Rule::image => vec![convert_image::<e::ImageInline>(inner_pair)?.into()],
+        rule => panic!("Unknown substitution rule {:?}", rule),
+    };
+    let mut subst_def = e::SubstitutionDefinition::with_children(inner);
+    subst_def.names_mut().push(at::NameToken(name));
+    Ok(subst_def)
+}
+
+/// Converts the first inner pair of a `replace` pair into inline elements.
+fn convert_replace(pair: Pair<Rule>) -> Result<Vec<c::TextOrInlineElement>, Error> {
+    let mut pairs = pair.into_inner();
+    let paragraph = pairs.next().unwrap();
+    convert_inlines(paragraph)
+}
+
+/// Converts an `image` pair into any element type carrying image attributes.
+///
+/// The first inner pair is trimmed and parsed into the argument of `a::Image::new`; every
+/// further pair holds an option name and value. Unknown option names yield an error.
+fn convert_image<I>(pair: Pair<Rule>) -> Result<I, Error> where I: Element + ExtraAttributes<a::Image> {
+    let mut pairs = pair.into_inner();
+    let mut image = I::with_extra(a::Image::new(
+        pairs.next().unwrap().as_str().trim().parse()?, // line
+    ));
+    for opt in pairs {
+        let mut opt_iter = opt.into_inner();
+        let opt_name = opt_iter.next().unwrap();
+        let opt_val = opt_iter.next().unwrap();
+        match opt_name.as_str() {
+            "class" => image.classes_mut().push(opt_val.as_str().to_owned()),
+            "name" => image.names_mut().push(opt_val.as_str().into()),
+            "alt" => image.extra_mut().alt = Some(opt_val.as_str().to_owned()),
+            "height" => image.extra_mut().height = Some(opt_val.parse()?),
+            "width" => image.extra_mut().width = Some(opt_val.parse()?),
+            "scale" => image.extra_mut().scale = Some(parse_scale(&opt_val)?),
+            "align" => image.extra_mut().align = Some(opt_val.parse()?),
+            "target" => image.extra_mut().target = Some(opt_val.parse()?),
+            name => bail!("Unknown Image option {}", name),
+        }
+    }
+    Ok(image)
+}
+
+/// Parses a scale option value into a `u8`, ignoring one trailing `%`.
+///
+/// Integer parse failures are reported as a pest error located at the span of `pair`.
+fn parse_scale(pair: &Pair<Rule>) -> Result<u8, Error> {
+    use pest::error::{Error,ErrorVariant};
+    let text = pair.as_str();
+    let input = if text.ends_with('%') { &text[..text.len()-1] } else { text };
+    Ok(input.parse().map_err(|e: std::num::ParseIntError| {
+        let var: ErrorVariant<Rule> = ErrorVariant::CustomError { message: e.to_string() };
+        Error::new_from_span(var, pair.as_span())
+    })?)
+}
+
+/// Converts an `admonition_gen` pair into the admonition element named by its first inner pair.
+///
+/// Every remaining inner pair becomes a paragraph with its raw text as the only child.
+/// An unknown admonition type yields an error.
+fn convert_admonition_gen(pair: Pair<Rule>) -> Result<c::BodyElement, Error> {
+    let mut iter = pair.into_inner();
+    let typ = iter.next().unwrap().as_str();
+    // TODO: in reality it contains body elements.
+    let children: Vec<c::BodyElement> = iter.map(|p| e::Paragraph::with_children(vec![p.as_str().into()]).into()).collect();
+    Ok(match typ {
+        "attention" => e::Attention::with_children(children).into(),
+        "hint" => e::Hint::with_children(children).into(),
+        "note" => e::Note::with_children(children).into(),
+        "caution" => e::Caution::with_children(children).into(),
+        "danger" => e::Danger::with_children(children).into(),
+        "error" => e::Error::with_children(children).into(),
+        "important" => e::Important::with_children(children).into(),
+        "tip" => e::Tip::with_children(children).into(),
+        "warning" => e::Warning::with_children(children).into(),
+        typ => bail!("Unknown admonition type {}!", typ),
+    })
+}
+
+/// Converts a `bullet_list` pair; each inner pair becomes one list item.
+fn convert_bullet_list(pair: Pair<Rule>) -> Result<e::BulletList, Error> {
+    Ok(e::BulletList::with_children(pair.into_inner().map(convert_bullet_item).collect::<Result<_, _>>()?))
+}
+
+/// Converts a bullet item: the first inner pair is a paragraph, the rest are body elements.
+fn convert_bullet_item(pair: Pair<Rule>) -> Result<e::ListItem, Error> {
+    let mut iter = pair.into_inner();
+    let mut children: Vec<c::BodyElement> = vec![
+        convert_paragraph(iter.next().unwrap())?.into()
+    ];
+    for p in iter {
+        children.push(convert_body_elem(p)?);
+    }
+    Ok(e::ListItem::with_children(children))
+}
diff --git a/parser/src/conversion/inline.rs b/parser/src/conversion/inline.rs
new file mode 100644
index 0000000..6094714
--- /dev/null
+++ b/parser/src/conversion/inline.rs
@@ -0,0 +1,172 @@
+use failure::Error;
+use pest::iterators::Pair;
+
+use document_tree::{
+    HasChildren,
+    elements as e,
+    url::Url,
+    element_categories as c,
+    extra_attributes as a,
+    attribute_types as at,
+};
+
+use crate::{
+    pest_rst::Rule,
+//    pair_ext_parse::PairExt,
+};
+use super::whitespace_normalize_name;
+
+
+/// Converts a single inline pair into text or an inline element.
+///
+/// Returns an error (instead of panicking) for rules that are not handled yet.
+pub fn convert_inline(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> {
+    Ok(match pair.as_rule() {
+        Rule::str | Rule::str_nested => pair.as_str().into(),
+        Rule::ws_newline => " ".to_owned().into(),
+        Rule::reference => convert_reference(pair)?,
+        Rule::substitution_name => convert_substitution_ref(pair)?.into(),
+        Rule::emph => e::Emphasis::with_children(convert_inlines(pair)?).into(),
+        Rule::strong => e::Strong::with_children(convert_inlines(pair)?).into(),
+        Rule::literal => e::Literal::with_children(convert_inlines(pair)?).into(),
+        rule => return Err(failure::err_msg(format!("unknown rule {:?}", rule))),
+    })
+}
+
+/// Converts every inner pair of `pair` with [`convert_inline`], stopping at the first error.
+pub fn convert_inlines(pair: Pair<Rule>) -> Result<Vec<c::TextOrInlineElement>, Error> {
+    pair.into_inner().map(convert_inline).collect()
+}
+
+/// Converts a `reference` pair into a reference element (or plain text).
+///
+/// * `reference_target_uq`: the matched text is used as name, `refname` and child text.
+/// * `reference_target_qu`: optional `reference_text` plus optional `reference_bracketed`
+///   (URL, target name or relative reference).
+/// * `reference_auto`: URLs and e-mail addresses (prefixed with `mailto:`); if the URL
+///   cannot be parsed, the matched text is returned unchanged as plain text.
+/// * `reference_explicit`: not implemented yet, yields an error.
+fn convert_reference(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> {
+    let name;
+    let refuri;
+    let refid;
+    let mut refname = vec![];
+    let mut children: Vec<c::TextOrInlineElement> = vec![];
+    let concrete = pair.into_inner().next().unwrap();
+    match concrete.as_rule() {
+        Rule::reference_target => {
+            let rt_inner = concrete.into_inner().next().unwrap(); // reference_target_uq or reference_target_qu
+            match rt_inner.as_rule() {
+                Rule::reference_target_uq => {
+                    refid = None;
+                    name = Some(rt_inner.as_str().into());
+                    refuri = None;
+                    refname.push(rt_inner.as_str().into());
+                    children.push(rt_inner.as_str().into());
+                },
+                Rule::reference_target_qu => {
+                    let (text, reference) = {
+                        let mut text = None;
+                        let mut reference = None;
+                        for inner in rt_inner.clone().into_inner() {
+                            match inner.as_rule() {
+                                Rule::reference_text => text = Some(inner),
+                                Rule::reference_bracketed => reference = Some(inner),
+                                _ => unreachable!()
+                            }
+                        }
+                        (text, reference)
+                    };
+                    let trimmed_text = match (&text, &reference) {
+                        (Some(text), None) => text.as_str(),
+                        (_, Some(reference)) => {
+                            text
+                                .map(|text| text.as_str().trim_end_matches(|ch| " \n\r".contains(ch)))
+                                .filter(|text| !text.is_empty())
+                                .unwrap_or_else(|| reference.clone().into_inner().next().unwrap().as_str())
+                        }
+                        (None, None) => unreachable!()
+                    };
+                    refid = None;
+                    name = Some(trimmed_text.into());
+                    refuri = if let Some(reference) = reference {
+                        let inner = reference.into_inner().next().unwrap();
+                        match inner.as_rule() {
+                            // The URL rules in our parser accept a narrow superset of
+                            // valid URLs, so we need to handle false positives.
+                            Rule::url => if let Ok(target) = Url::parse_absolute(inner.as_str()) {
+                                Some(target)
+                            } else if inner.as_str().ends_with('_') {
+                                // like target_name_qu (minus the final underscore)
+                                let full_str = inner.as_str();
+                                refname.push(full_str[0..full_str.len() - 1].into());
+                                None
+                            } else {
+                                // like relative_reference
+                                Some(Url::parse_relative(inner.as_str())?)
+                            },
+                            Rule::target_name_qu => {
+                                refname.push(inner.as_str().into());
+                                None
+                            },
+                            Rule::relative_reference => {
+                                Some(Url::parse_relative(inner.as_str())?)
+                            },
+                            _ => unreachable!()
+                        }
+                    } else {
+                        refname.push(trimmed_text.into());
+                        None
+                    };
+                    children.push(trimmed_text.into());
+                },
+                _ => unreachable!()
+            }
+        },
+        Rule::reference_explicit => return Err(failure::err_msg("explicit references are not implemented yet")),
+        Rule::reference_auto => {
+            let rt_inner = concrete.into_inner().next().unwrap();
+            match rt_inner.as_rule() {
+                Rule::url_auto => match Url::parse_absolute(rt_inner.as_str()) {
+                    Ok(target) => {
+                        refuri = Some(target);
+                        name = None;
+                        refid = None;
+                        children.push(rt_inner.as_str().into());
+                    },
+                    // if our parser got a URL wrong, return it as a string
+                    Err(_) => return Ok(rt_inner.as_str().into())
+                },
+                Rule::email => {
+                    let mailto_url = String::from("mailto:") + rt_inner.as_str();
+                    match Url::parse_absolute(&mailto_url) {
+                        Ok(target) => {
+                            refuri = Some(target);
+                            name = None;
+                            refid = None;
+                            children.push(rt_inner.as_str().into());
+                        },
+                        // if our parser got a URL wrong, return it as a string
+                        Err(_) => return Ok(rt_inner.as_str().into())
+                    }
+                },
+                _ => unreachable!()
+            }
+        },
+        _ => unreachable!(),
+    };
+    Ok(e::Reference::new(
+        Default::default(),
+        a::Reference { name, refuri, refid, refname },
+        children
+    ).into())
+}
+
+/// Builds a substitution reference whose only `refname` is the whitespace-normalized pair text.
+fn convert_substitution_ref(pair: Pair<Rule>) -> Result<e::SubstitutionReference, Error> {
+    let name = whitespace_normalize_name(pair.as_str());
+    Ok(a::ExtraAttributes::with_extra(
+        a::SubstitutionReference {
+            refname: vec![at::NameToken(name)]
+        }
+    ))
+}
diff --git a/parser/src/conversion/tests.rs b/parser/src/conversion/tests.rs
new file mode 100644
index 0000000..89b0a1c
--- /dev/null
+++ b/parser/src/conversion/tests.rs
@@ -0,0 +1,68 @@
+use document_tree::{
+    elements as e,
+    element_categories as c,
+    HasChildren,
+};
+
+use crate::parse;
+
+/// Unwraps a structural sub-element into the section it holds; panics for anything else.
+fn ssubel_to_section(ssubel: &c::StructuralSubElement) -> &e::Section {
+    match ssubel {
+        c::StructuralSubElement::SubStructure(ref b) => match **b {
+            c::SubStructure::Section(ref s) => s,
+            ref c => panic!("Expected section, not {:?}", c),
+        },
+        ref c => panic!("Expected SubStructure, not {:?}", c),
+    }
+}
+
+/// Document with three title styles where the second top-level section skips the middle style.
+const SECTIONS: &str = "\
+Intro before first section title
+
+Level 1
+*******
+
+-------
+Level 2
+-------
+
+Level 3
+=======
+
+L1 again
+********
+
+L3 again, skipping L2
+=====================
+";
+
+/// Checks the section nesting produced for [`SECTIONS`], including the skipped level.
+#[test]
+fn convert_skipped_section() {
+    let doctree = parse(SECTIONS).unwrap();
+    let lvl0 = doctree.children();
+    assert_eq!(lvl0.len(), 3, "Should be a paragraph and 2 sections: {:?}", lvl0);
+
+    assert_eq!(lvl0[0], e::Paragraph::with_children(vec![
+        "Intro before first section title".to_owned().into()
+    ]).into(), "The intro text should fit");
+
+    let lvl1a = ssubel_to_section(&lvl0[1]).children();
+    assert_eq!(lvl1a.len(), 2, "The 1st lvl1 section should have (a title and) a single lvl2 section as child: {:?}", lvl1a);
+    //TODO: test title lvl1a[0]
+    let lvl2 = ssubel_to_section(&lvl1a[1]).children();
+    assert_eq!(lvl2.len(), 2, "The lvl2 section should have (a title and) a single lvl3 section as child: {:?}", lvl2);
+    //TODO: test title lvl2[0]
+    let lvl3a = ssubel_to_section(&lvl2[1]).children();
+    assert_eq!(lvl3a.len(), 1, "The 1st lvl3 section should have just a title: {:?}", lvl3a);
+    //TODO: test title lvl3a[0]
+
+    let lvl1b = ssubel_to_section(&lvl0[2]).children();
+    assert_eq!(lvl1b.len(), 2, "The 2nd lvl1 section should have (a title and) a single lvl3 section as child: {:?}", lvl1b);
+    //TODO: test title lvl1b[0]
+    let lvl3b = ssubel_to_section(&lvl1b[1]).children();
+    assert_eq!(lvl3b.len(), 1, "The 2nd lvl3 section should have just a title: {:?}", lvl3b);
+    //TODO: test title lvl3b[0]
+}
