diff options
| author | Philipp A | 2019-12-26 23:01:00 +0100 | 
|---|---|---|
| committer | Philipp A | 2019-12-26 23:36:48 +0100 | 
| commit | a0e3c53758d526bb418c068bce1c99fa5a597ed3 (patch) | |
| tree | e640238b011a9ea7806ccccaf1a435e4b371a376 /parser | |
| parent | 7018f5d3c42f18b6c83f398db9f1915361a7c679 (diff) | |
| download | rust-rst-a0e3c53758d526bb418c068bce1c99fa5a597ed3.tar.bz2 | |
Split into smaller crates
Diffstat (limited to 'parser')
| -rw-r--r-- | parser/Cargo.toml | 18 | ||||
| -rw-r--r-- | parser/src/conversion.rs | 96 | ||||
| -rw-r--r-- | parser/src/conversion/block.rs | 202 | ||||
| -rw-r--r-- | parser/src/conversion/inline.rs | 160 | ||||
| -rw-r--r-- | parser/src/conversion/tests.rs | 65 | ||||
| -rw-r--r-- | parser/src/lib.rs | 28 | ||||
| -rw-r--r-- | parser/src/pair_ext_parse.rs | 21 | ||||
| -rw-r--r-- | parser/src/pest_rst.rs | 7 | ||||
| -rw-r--r-- | parser/src/rst.pest | 474 | ||||
| -rw-r--r-- | parser/src/simplify.rs | 662 | ||||
| -rw-r--r-- | parser/src/tests.rs | 242 | ||||
| -rw-r--r-- | parser/src/token.rs | 16 | 
12 files changed, 1991 insertions, 0 deletions
| diff --git a/parser/Cargo.toml b/parser/Cargo.toml new file mode 100644 index 0000000..22f2490 --- /dev/null +++ b/parser/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = 'rst_parser' +version = '0.2.0' +authors = ['Philipp A. <flying-sheep@web.de>'] +edition = '2018' +description = 'a reStructuredText parser' +license = 'MIT OR Apache-2.0' + +documentation = 'https://flying-sheep.github.io/rust-rst' +homepage = 'https://github.com/flying-sheep/rust-rst' +repository = 'https://github.com/flying-sheep/rust-rst.git' + +[dependencies] +document_tree = { path = '../document_tree' } + +pest = '2.1.2' +pest_derive = '2.1.0' +failure = '0.1.6' diff --git a/parser/src/conversion.rs b/parser/src/conversion.rs new file mode 100644 index 0000000..de5f091 --- /dev/null +++ b/parser/src/conversion.rs @@ -0,0 +1,96 @@ +mod block; +mod inline; +#[cfg(test)] +mod tests; + +use failure::Error; +use pest::iterators::Pairs; + +use document_tree::{ +	Element,HasChildren, +	elements as e, +	element_categories as c, +	attribute_types as at, +}; + +use crate::pest_rst::Rule; + + +fn ssubel_to_section_unchecked_mut(ssubel: &mut c::StructuralSubElement) -> &mut e::Section { +	match ssubel { +		c::StructuralSubElement::SubStructure(ref mut b) => match **b { +			c::SubStructure::Section(ref mut s) => s, +			_ => unreachable!(), +		}, +		_ => unreachable!(), +	} +} + + +fn get_level<'tl>(toplevel: &'tl mut Vec<c::StructuralSubElement>, section_idxs: &[Option<usize>]) -> &'tl mut Vec<c::StructuralSubElement> { +	let mut level = toplevel; +	for maybe_i in section_idxs { +		if let Some(i) = *maybe_i { +			level = ssubel_to_section_unchecked_mut(&mut level[i]).children_mut(); +		} +	} +	level +} + + +pub fn convert_document(pairs: Pairs<Rule>) -> Result<e::Document, Error> { +	use self::block::TitleOrSsubel::*; +	 +	let mut toplevel: Vec<c::StructuralSubElement> = vec![]; +	// The kinds of section titles encountered. 
+	// `section_idx[x]` has the kind `kinds[x]`, but `kinds` can be longer +	let mut kinds: Vec<block::TitleKind> = vec![]; +	// Recursive indices into the tree, pointing at the active sections. +	// `None`s indicate skipped section levels: +	// toplevel[section_idxs.flatten()[0]].children[section_idxs.flatten()[1]]... +	let mut section_idxs: Vec<Option<usize>> = vec![]; +	 +	for pair in pairs { +		if let Some(ssubel) = block::convert_ssubel(pair)? { match ssubel { +			Title(title, kind) => { +				match kinds.iter().position(|k| k == &kind) { +					// Idx points to the level we want to add, +					// so idx-1 needs to be the last valid index. +					Some(idx) => { +						// If idx < len: Remove found section and all below +						section_idxs.truncate(idx); +						// If idx > len: Add None for skipped levels +						// TODO: test skipped levels +						while section_idxs.len() < idx { section_idxs.push(None) } +					}, +					None => kinds.push(kind), +				} +				let super_level = get_level(&mut toplevel, §ion_idxs); +				let slug = title.names().iter().next().map(|at::NameToken(name)| at::ID(name.to_owned())); +				let mut section = e::Section::with_children(vec![title.into()]); +				section.ids_mut().extend(slug.into_iter()); +				super_level.push(section.into()); +				section_idxs.push(Some(super_level.len() - 1)); +			}, +			Ssubel(elem) => get_level(&mut toplevel, §ion_idxs).push(elem), +		}} +	} +	Ok(e::Document::with_children(toplevel)) +} + +/// Normalizes a name in terms of whitespace. Equivalent to docutils's +/// `docutils.nodes.whitespace_normalize_name`. +pub fn whitespace_normalize_name(name: &str) -> String { +	// Python's string.split() defines whitespace differently than Rust does. 
+	let split_iter = name.split( +		|ch: char| ch.is_whitespace() || (ch >= '\x1C' && ch <= '\x1F') +	).filter(|split| !split.is_empty()); +	let mut ret = String::new(); +	for split in split_iter { +		if !ret.is_empty() { +			ret.push(' '); +		} +		ret.push_str(split); +	} +	ret +} diff --git a/parser/src/conversion/block.rs b/parser/src/conversion/block.rs new file mode 100644 index 0000000..ab18c48 --- /dev/null +++ b/parser/src/conversion/block.rs @@ -0,0 +1,202 @@ +use failure::{Error,bail}; +use pest::iterators::Pair; + +use document_tree::{ +	Element,HasChildren,ExtraAttributes, +	elements as e, +	element_categories as c, +	extra_attributes as a, +	attribute_types as at +}; + +use crate::{ +	pest_rst::Rule, +	pair_ext_parse::PairExt, +}; +use super::{whitespace_normalize_name, inline::convert_inlines}; + + +#[derive(PartialEq)] +pub(super) enum TitleKind { Double(char), Single(char) } + +pub(super) enum TitleOrSsubel { +	Title(e::Title, TitleKind), +	Ssubel(c::StructuralSubElement), +} + + +pub(super) fn convert_ssubel(pair: Pair<Rule>) -> Result<Option<TitleOrSsubel>, Error> { +	use self::TitleOrSsubel::*; +	Ok(Some(match pair.as_rule() { +		Rule::title => { let (t, k) = convert_title(pair)?; Title(t, k) }, +		//TODO: subtitle, decoration, docinfo +		Rule::EOI   => return Ok(None), +		_           => Ssubel(convert_substructure(pair)?.into()), +	})) +} + + +fn convert_substructure(pair: Pair<Rule>) -> Result<c::SubStructure, Error> { +	Ok(match pair.as_rule() { +		// todo: Topic, Sidebar, Transition +		// no section here, as it’s constructed from titles +		_ => convert_body_elem(pair)?.into(), +	}) +} + + +fn convert_body_elem(pair: Pair<Rule>) -> Result<c::BodyElement, Error> { +	Ok(match pair.as_rule() { +		Rule::paragraph        => convert_paragraph(pair)?.into(), +		Rule::target           => convert_target(pair)?.into(), +		Rule::substitution_def => convert_substitution_def(pair)?.into(), +		Rule::admonition_gen   => convert_admonition_gen(pair)?.into(), +		
Rule::image            => convert_image::<e::Image>(pair)?.into(), +		Rule::bullet_list      => convert_bullet_list(pair)?.into(), +		rule => unimplemented!("unhandled rule {:?}", rule), +	}) +} + + +fn convert_title(pair: Pair<Rule>) -> Result<(e::Title, TitleKind), Error> { +	let mut title: Option<String> = None; +	let mut title_inlines: Option<Vec<c::TextOrInlineElement>> = None; +	let mut adornment_char: Option<char> = None; +	// title_double or title_single. Extract kind before consuming +	let inner_pair = pair.into_inner().next().unwrap(); +	let kind = inner_pair.as_rule(); +	for p in inner_pair.into_inner() { +		match p.as_rule() { +			Rule::line => { +				title = Some(p.as_str().to_owned()); +				title_inlines = Some(convert_inlines(p)?); +			}, +			Rule::adornments => adornment_char = Some(p.as_str().chars().next().expect("Empty adornment?")), +			rule => unimplemented!("Unexpected rule in title: {:?}", rule), +		}; +	} +	// now we encountered one line of text and one of adornments +	// TODO: emit error if the adornment line is too short (has to match title length) +	let mut elem = e::Title::with_children(title_inlines.expect("No text in title")); +	if let Some(title) = title { +		//TODO: slugify properly +		let slug =  title.to_lowercase().replace("\n", "").replace(" ", "-"); +		elem.names_mut().push(at::NameToken(slug)); +	} +	let title_kind = match kind { +		Rule::title_double => TitleKind::Double(adornment_char.unwrap()), +		Rule::title_single => TitleKind::Single(adornment_char.unwrap()), +		_ => unreachable!(), +	}; +	Ok((elem, title_kind)) +} + + +fn convert_paragraph(pair: Pair<Rule>) -> Result<e::Paragraph, Error> { +	Ok(e::Paragraph::with_children(convert_inlines(pair)?)) +} + + +fn convert_target(pair: Pair<Rule>) -> Result<e::Target, Error> { +	let mut elem: e::Target = Default::default(); +	elem.extra_mut().anonymous = false; +	for p in pair.into_inner() { +		match p.as_rule() { +			Rule::target_name_uq | Rule::target_name_qu => { +				
elem.ids_mut().push(p.as_str().into()); +				elem.names_mut().push(p.as_str().into()); +			}, +			// TODO: also handle non-urls +			Rule::link_target => elem.extra_mut().refuri = Some(p.parse()?), +			rule => panic!("Unexpected rule in target: {:?}", rule), +		} +	} +	Ok(elem) +} + +fn convert_substitution_def(pair: Pair<Rule>) -> Result<e::SubstitutionDefinition, Error> { +	let mut pairs = pair.into_inner(); +	let name = whitespace_normalize_name(pairs.next().unwrap().as_str());  // Rule::substitution_name +	let inner_pair = pairs.next().unwrap(); +	let inner: Vec<c::TextOrInlineElement> = match inner_pair.as_rule() { +		Rule::replace => convert_replace(inner_pair)?, +		Rule::image   => vec![convert_image::<e::ImageInline>(inner_pair)?.into()], +		rule => panic!("Unknown substitution rule {:?}", rule), +	}; +	let mut subst_def = e::SubstitutionDefinition::with_children(inner); +	subst_def.names_mut().push(at::NameToken(name)); +	Ok(subst_def) +} + +fn convert_replace(pair: Pair<Rule>) -> Result<Vec<c::TextOrInlineElement>, Error> { +	let mut pairs = pair.into_inner(); +	let paragraph = pairs.next().unwrap(); +	convert_inlines(paragraph) +}  + +fn convert_image<I>(pair: Pair<Rule>) -> Result<I, Error> where I: Element + ExtraAttributes<a::Image> { +	let mut pairs = pair.into_inner(); +	let mut image = I::with_extra(a::Image::new( +		pairs.next().unwrap().as_str().trim().parse()?,  // line +	)); +	for opt in pairs { +		let mut opt_iter = opt.into_inner(); +		let opt_name = opt_iter.next().unwrap(); +		let opt_val = opt_iter.next().unwrap(); +		match opt_name.as_str() { +			"class"  => image.classes_mut().push(opt_val.as_str().to_owned()), +			"name"   => image.names_mut().push(opt_val.as_str().into()), +			"alt"    => image.extra_mut().alt    = Some(opt_val.as_str().to_owned()), +			"height" => image.extra_mut().height = Some(opt_val.parse()?), +			"width"  => image.extra_mut().width  = Some(opt_val.parse()?), +			"scale"  => image.extra_mut().scale  = 
Some(parse_scale(&opt_val)?), +			"align"  => image.extra_mut().align  = Some(opt_val.parse()?), +			"target" => image.extra_mut().target = Some(opt_val.parse()?), +			name => bail!("Unknown Image option {}", name), +		} +	} +	Ok(image) +} + +fn parse_scale(pair: &Pair<Rule>) -> Result<u8, Error> { +	let input = if pair.as_str().chars().rev().next() == Some('%') { &pair.as_str()[..pair.as_str().len()-1] } else { pair.as_str() }; +	use pest::error::{Error,ErrorVariant}; +	Ok(input.parse().map_err(|e: std::num::ParseIntError| { +		let var: ErrorVariant<Rule> = ErrorVariant::CustomError { message: e.to_string() }; +		Error::new_from_span(var, pair.as_span()) +	})?) +} + +fn convert_admonition_gen(pair: Pair<Rule>) -> Result<c::BodyElement, Error> { +	let mut iter = pair.into_inner(); +	let typ = iter.next().unwrap().as_str(); +	// TODO: in reality it contains body elements. +	let children: Vec<c::BodyElement> = iter.map(|p| e::Paragraph::with_children(vec![p.as_str().into()]).into()).collect(); +	Ok(match typ { +		"attention" => e::Attention::with_children(children).into(), +		"hint"      =>      e::Hint::with_children(children).into(), +		"note"      =>      e::Note::with_children(children).into(), +		"caution"   =>   e::Caution::with_children(children).into(), +		"danger"    =>    e::Danger::with_children(children).into(), +		"error"     =>     e::Error::with_children(children).into(), +		"important" => e::Important::with_children(children).into(), +		"tip"       =>       e::Tip::with_children(children).into(), +		"warning"   =>   e::Warning::with_children(children).into(), +		typ         => panic!("Unknown admontion type {}!", typ), +	}) +} + +fn convert_bullet_list(pair: Pair<Rule>) -> Result<e::BulletList, Error> { +	Ok(e::BulletList::with_children(pair.into_inner().map(convert_bullet_item).collect::<Result<_, _>>()?)) +} + +fn convert_bullet_item(pair: Pair<Rule>) -> Result<e::ListItem, Error> { +	let mut iter = pair.into_inner(); +	let mut children: 
Vec<c::BodyElement> = vec![ +		convert_paragraph(iter.next().unwrap())?.into() +	]; +	for p in iter { +		children.push(convert_body_elem(p)?); +	} +	Ok(e::ListItem::with_children(children)) +} diff --git a/parser/src/conversion/inline.rs b/parser/src/conversion/inline.rs new file mode 100644 index 0000000..6094714 --- /dev/null +++ b/parser/src/conversion/inline.rs @@ -0,0 +1,160 @@ +use failure::Error; +use pest::iterators::Pair; + +use document_tree::{ +	HasChildren, +	elements as e, +	url::Url, +	element_categories as c, +	extra_attributes as a, +	attribute_types as at, +}; + +use crate::{ +	pest_rst::Rule, +//    pair_ext_parse::PairExt, +}; +use super::whitespace_normalize_name; + + +pub fn convert_inline(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> { +	Ok(match pair.as_rule() { +		Rule::str | Rule::str_nested => pair.as_str().into(), +		Rule::ws_newline        => " ".to_owned().into(), +		Rule::reference         => convert_reference(pair)?, +		Rule::substitution_name => convert_substitution_ref(pair)?.into(), +		Rule::emph              => e::Emphasis::with_children(convert_inlines(pair)?).into(), +		Rule::strong            => e::Strong::with_children(convert_inlines(pair)?).into(), +		Rule::literal           => e::Literal::with_children(convert_inlines(pair)?).into(), +		rule => unimplemented!("unknown rule {:?}", rule), +	}) +} + +pub fn convert_inlines(pair: Pair<Rule>) -> Result<Vec<c::TextOrInlineElement>, Error> { +	pair.into_inner().map(convert_inline).collect() +} + +fn convert_reference(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> { +	let name; +	let refuri; +	let refid; +	let mut refname = vec![]; +	let mut children: Vec<c::TextOrInlineElement> = vec![]; +	let concrete = pair.into_inner().next().unwrap(); +	match concrete.as_rule() { +		Rule::reference_target => { +			let rt_inner = concrete.into_inner().next().unwrap(); // reference_target_uq or target_name_qu +			match rt_inner.as_rule() { +				Rule::reference_target_uq 
=> { +					refid  = None; +					name   = Some(rt_inner.as_str().into()); +					refuri = None; +					refname.push(rt_inner.as_str().into()); +					children.push(rt_inner.as_str().into()); +				}, +				Rule::reference_target_qu => { +					let (text, reference) = { +						let mut text = None; +						let mut reference = None; +						for inner in rt_inner.clone().into_inner() { +							match inner.as_rule() { +								Rule::reference_text => text = Some(inner), +								Rule::reference_bracketed => reference = Some(inner), +								_ => unreachable!() +							} +						} +						(text, reference) +					}; +					let trimmed_text = match (&text, &reference) { +						(Some(text), None) => text.as_str(), +						(_, Some(reference)) => { +							text +								.map(|text| text.as_str().trim_end_matches(|ch| " \n\r".contains(ch))) +								.filter(|text| !text.is_empty()) +								.unwrap_or_else(|| reference.clone().into_inner().next().unwrap().as_str()) +						} +						(None, None) => unreachable!() +					}; +					refid = None; +					name = Some(trimmed_text.into()); +					refuri = if let Some(reference) = reference { +						let inner = reference.into_inner().next().unwrap(); +						match inner.as_rule() { +							// The URL rules in our parser accept a narrow superset of +							// valid URLs, so we need to handle false positives. +							Rule::url => if let Ok(target) = Url::parse_absolute(inner.as_str()) { +								Some(target) +							} else if inner.as_str().ends_with('_') { +								// like target_name_qu (minus the final underscore) +								let full_str = inner.as_str(); +								refname.push(full_str[0..full_str.len() - 1].into()); +								None +							} else { +								// like relative_reference +								Some(Url::parse_relative(inner.as_str())?) +							}, +							Rule::target_name_qu => { +								refname.push(inner.as_str().into()); +								None +							}, +							Rule::relative_reference => { +								Some(Url::parse_relative(inner.as_str())?) 
+							}, +							_ => unreachable!() +						} +					} else { +						refname.push(trimmed_text.into()); +						None +					}; +					children.push(trimmed_text.into()); +				}, +				_ => unreachable!() +			} +		}, +		Rule::reference_explicit => unimplemented!("explicit reference"), +		Rule::reference_auto => { +			let rt_inner = concrete.into_inner().next().unwrap(); +			match rt_inner.as_rule() { +				Rule::url_auto => match Url::parse_absolute(rt_inner.as_str()) { +					Ok(target) => { +						refuri = Some(target); +						name   = None; +						refid  = None; +						children.push(rt_inner.as_str().into()); +					}, +					// if our parser got a URL wrong, return it as a string +					Err(_) => return Ok(rt_inner.as_str().into()) +				}, +				Rule::email => { +					let mailto_url = String::from("mailto:") + rt_inner.as_str(); +					match Url::parse_absolute(&mailto_url) { +						Ok(target) => { +							refuri = Some(target); +							name   = None; +							refid  = None; +							children.push(rt_inner.as_str().into()); +						}, +						// if our parser got a URL wrong, return it as a string +						Err(_) => return Ok(rt_inner.as_str().into()) +					} +				}, +				_ => unreachable!() +			} +		}, +		_ => unreachable!(), +	}; +	Ok(e::Reference::new( +		Default::default(), +		a::Reference { name, refuri, refid, refname }, +		children +	).into()) +} + +fn convert_substitution_ref(pair: Pair<Rule>) -> Result<e::SubstitutionReference, Error> { +	let name = whitespace_normalize_name(pair.as_str()); +	Ok(a::ExtraAttributes::with_extra( +		a::SubstitutionReference { +			refname: vec![at::NameToken(name)] +		} +	)) +} diff --git a/parser/src/conversion/tests.rs b/parser/src/conversion/tests.rs new file mode 100644 index 0000000..89b0a1c --- /dev/null +++ b/parser/src/conversion/tests.rs @@ -0,0 +1,65 @@ +use document_tree::{ +	elements as e, +	element_categories as c, +	HasChildren, +}; + +use crate::parse; + +fn ssubel_to_section(ssubel: &c::StructuralSubElement) -> &e::Section 
{ +	match ssubel { +		c::StructuralSubElement::SubStructure(ref b) => match **b { +			c::SubStructure::Section(ref s) => s, +			ref c => panic!("Expected section, not {:?}", c), +		}, +		ref c => panic!("Expected SubStructure, not {:?}", c), +	} +} + +const SECTIONS: &str = "\ +Intro before first section title + +Level 1 +******* + +------- +Level 2 +------- + +Level 3 +======= + +L1 again +******** + +L3 again, skipping L2 +===================== +"; + +#[test] +fn convert_skipped_section() { +	let doctree = parse(SECTIONS).unwrap(); +	let lvl0 = doctree.children(); +	assert_eq!(lvl0.len(), 3, "Should be a paragraph and 2 sections: {:?}", lvl0); +	 +	assert_eq!(lvl0[0], e::Paragraph::with_children(vec![ +		"Intro before first section title".to_owned().into() +	]).into(), "The intro text should fit"); +	 +	let lvl1a = ssubel_to_section(&lvl0[1]).children(); +	assert_eq!(lvl1a.len(), 2, "The 1st lvl1 section should have (a title and) a single lvl2 section as child: {:?}", lvl1a); +	//TODO: test title lvl1a[0] +	let lvl2  = ssubel_to_section(&lvl1a[1]).children(); +	assert_eq!(lvl2.len(), 2, "The lvl2 section should have (a title and) a single lvl3 section as child: {:?}", lvl2); +	//TODO: test title lvl2[0] +	let lvl3a = ssubel_to_section(&lvl2[1]).children(); +	assert_eq!(lvl3a.len(), 1, "The 1st lvl3 section should just a title: {:?}", lvl3a); +	//TODO: test title lvl3a[0] +	 +	let lvl1b = ssubel_to_section(&lvl0[2]).children(); +	assert_eq!(lvl1b.len(), 2, "The 2nd lvl1 section should have (a title and) a single lvl2 section as child: {:?}", lvl1b); +	//TODO: test title lvl1b[0] +	let lvl3b = ssubel_to_section(&lvl1b[1]).children(); +	assert_eq!(lvl3b.len(), 1, "The 2nd lvl3 section should have just a title: {:?}", lvl3b); +	//TODO: test title lvl3b[0] +} diff --git a/parser/src/lib.rs b/parser/src/lib.rs new file mode 100644 index 0000000..23e97c7 --- /dev/null +++ b/parser/src/lib.rs @@ -0,0 +1,28 @@ +pub mod token; +mod conversion; +mod simplify; +mod pest_rst; 
+mod pair_ext_parse; +#[cfg(test)] +pub mod tests; + +use failure::Error; +use pest::Parser; + +use document_tree::Document; + +use self::pest_rst::{RstParser,Rule}; +use self::conversion::convert_document; +use self::simplify::resolve_references; + + +/// Parse into a document tree and resolve sections, but not references. +pub fn parse_only(source: &str) -> Result<Document, Error> { +	let pairs = RstParser::parse(Rule::document, source)?; +	convert_document(pairs) +} + +/// Parse into a document tree and resolve sections and references.  +pub fn parse(source: &str) -> Result<Document, Error> { +	parse_only(source).map(resolve_references) +} diff --git a/parser/src/pair_ext_parse.rs b/parser/src/pair_ext_parse.rs new file mode 100644 index 0000000..a04b3dd --- /dev/null +++ b/parser/src/pair_ext_parse.rs @@ -0,0 +1,21 @@ +use std::str::FromStr; + +use pest::Span; +use pest::iterators::Pair; +use pest::error::{Error,ErrorVariant}; + + +pub trait PairExt<R> where R: pest::RuleType { +	fn parse<T, E>(&self) -> Result<T, Error<R>> where T: FromStr<Err = E>, E: ToString; +} + +impl<'l, R> PairExt<R> for Pair<'l, R> where R: pest::RuleType { +	fn parse<T, E>(&self) -> Result<T, Error<R>> where T: FromStr<Err = E>, E: ToString { +		self.as_str().parse().map_err(|e| to_parse_error(self.as_span(), &e)) +	} +} + +pub(crate) fn to_parse_error<E, R>(span: Span, e: &E) -> Error<R> where E: ToString, R: pest::RuleType { +	let var: ErrorVariant<R> = ErrorVariant::CustomError { message: e.to_string() }; +	Error::new_from_span(var, span) +} diff --git a/parser/src/pest_rst.rs b/parser/src/pest_rst.rs new file mode 100644 index 0000000..74199a8 --- /dev/null +++ b/parser/src/pest_rst.rs @@ -0,0 +1,7 @@ +#![allow(clippy::redundant_closure)] + +use pest_derive::Parser; + +#[derive(Parser)] +#[grammar = "rst.pest"] +pub struct RstParser; diff --git a/parser/src/rst.pest b/parser/src/rst.pest new file mode 100644 index 0000000..f3a1516 --- /dev/null +++ b/parser/src/rst.pest @@ -0,0 
+1,474 @@ +// Entry point: the document. + +// This grammar is aligned to the doctree names when possible. +// It will however contain blocks, as we can’t parse sections: +// Section headers define the hierarchy by their delimiters, +// and pest only has one stack that we need for indentation. + +document = _{ SOI ~ blocks ~ EOI } +blocks   = _{ block ~ (blank_line* ~ block)* ~ blank_line? } +block    = _{ PEEK[..] ~ hanging_block } + +// This is the list of all block-level elements +// They’re defined hanging, i.e. without the first PEEK[..] +// This is d +hanging_block = _{ +    substitution_def +    | image_directive +    | admonition +    | admonition_gen +    | target +    | title +    | bullet_list +    | paragraph +// TODO: implement all those things: +// | block_quote +// | verbatim +// | image ✓ +// | code_block +// | doctest_block +// | admonition ✓ +// | target ✓ +// | horizontal_rule +// | title ✓ +// | table +// | ordered_list +// | bullet_list ✓ +// | paragraph ✓ +// | plain +} + +// Substitution definition. A block type +substitution_def  =  { ".." ~ PUSH(" "+) ~ "|" ~ substitution_name ~ "|" ~ " "+ ~ inline_dirblock ~ DROP } +substitution_name =  { !" " ~ (!(" "|"|") ~ ANY)+ ~ (" "+ ~ (!(" "|"|") ~ ANY)+)* } +inline_dirblock   = _{ replace | image }  // TODO: implement others + +// Target. A block type +target         =  { target_qu | target_uq } +target_uq      = _{ ".. _"  ~         target_name_uq ~           ":" ~ (" " ~ link_target)? ~ " "* ~ NEWLINE } +target_qu      = _{ ".. _`" ~ !"``" ~ target_name_qu ~ !"``:" ~ "`:" ~ (" " ~ link_target)? ~ " "* ~ NEWLINE } +target_name_uq =  { ( !("_"|":"|"`") ~ !NEWLINE ~ ANY )* } +target_name_qu =  { ( !(":"|"`"|"_>") ~ ANY )* } +link_target    =  { nonspacechar+ } + +// Title. A block type +title = { title_double | title_single } +title_double = { PUSH(adornments) ~ NEWLINE ~ PEEK[..-1] ~ " "* ~ line ~ PEEK[..-1] ~ POP } +title_single = { line ~ PEEK[..] ~ adornments ~ NEWLINE } + +// Bullet list. 
A block type. +bullet_list =  { bullet_item ~ (PEEK[..] ~ bullet_item)* } +bullet_item =  { bullet_marker ~ PUSH(" "+) ~ line ~ blank_line* ~ blist_body? ~ DROP } +blist_body  = _{ PEEK[..-1] ~ PUSH(" " ~ POP) ~ hanging_block ~ block* } + +// paragraph. A block type. +paragraph =  { inlines } + + +/* Directives: http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#directives + * .. name:: arguments ~ :options: ~ blank_line+ ~ content + * Everything except for the first argument has to be indented + */ + + +// Directives with options can have these or specific ones: +common_opt_name = { "class" | "name" } + +// Replace. A directive only usable in substitutions. + +replace = { ^"replace::" ~ " "* ~ paragraph } + +// Image. A directive. + +image_directive = _{ ".." ~ PUSH(" "+) ~ image ~ DROP } +image           =  { ^"image::" ~ line ~ image_opt_block? } +image_opt_block = _{ PEEK[..-1] ~ PUSH("  " ~ POP) ~ image_option } //TODO: merge with other directives? +image_option    =  { ":" ~ image_opt_name ~ ":" ~ line } +image_opt_name  =  { common_opt_name | "alt" | "height" | "width" | "scale" | "align" | "target" } + +// Admonition. A directive. The generic one has a title + +admonition         =  { ".." ~ PUSH(" "+) ~ ^"admonition::"         ~               line  ~ blank_line* ~ admonition_content? ~ DROP } +admonition_gen     =  { ".." ~ PUSH(" "+) ~  admonition_type ~ "::" ~ (blank_line | line) ~ blank_line* ~ admonition_content? ~ DROP } +admonition_type    =  { ^"attention" | ^"caution" | ^"danger" | ^"error" | ^"hint" | ^"important" | ^"note" | ^"tip" | ^"warning" } +admonition_content = _{ PEEK[..-1] ~ PUSH("  " ~ POP) ~ hanging_block ~ block* } //TODO: merge with other directives? + + + +/* + * inlines + */ + + +line       =  { !marker ~ inline+ ~ NEWLINE } +blank_line = _{ !marker ~ !inline ~ " "* ~ NEWLINE } + +inlines    = _{ !marker ~ inline+ ~ ( ( ws_newline ~ PEEK[..] ~ !marker ~ inline+ )+ ~ NEWLINE )? 
} +ws_newline =  { NEWLINE } +inline     = _{ inline_special | str } +inline_special = _{ +    reference +    | substitution_ref +    | emph_outer +    | strong_outer +    | literal_outer +//     | ul_or_star_line +//     | space +//     | note_reference +//     | footnote +//     //| citation +//     | code +//     | application_depent +//     | entity +//     | escaped_char +//     | smart +//     | symbol +} + +str = { (!(NEWLINE | inline_special) ~ ANY)+ } + +// simple formatting +inline_nested = _{ inline_special | str_nested } +str_nested    =  { word_nested ~ ( " "+ ~ word_nested)* } +// TODO: allow ` in emph and * in literal +word_nested   = _{ (!(NEWLINE | " " | inline_special | "*" | "`") ~ ANY)+ } + +emph_outer    = _{ "*" ~ emph ~ "*" } +emph          =  { (!("*"|" ") ~ inline_nested)+ ~ (" "+ ~ (!("*"|" ") ~ inline_nested)+)* } +strong_outer  = _{ "**" ~ strong ~ "**" } +strong        =  { (!("*"|" ") ~ inline_nested)+ ~ (" "+ ~ (!("*"|" ") ~ inline_nested)+)* } +literal_outer = _{ "``" ~ literal ~ "``" } +literal       =  { (!("`"|" ") ~ inline_nested)+ ~ (" "+ ~ (!("`"|" ") ~ inline_nested)+)* } + +// inline links +reference = { reference_target | reference_explicit | reference_auto } + +reference_target = { reference_target_uq ~ "_" | reference_target_qu } +reference_target_uq =  { (!("_"|":"|"`") ~ nonspacechar)+ } +reference_target_qu = { ( !("`"? ~ "`_") ~ "`" ~ !"``" ) ~ reference_text? ~ ("<" ~ reference_bracketed ~ ">")? ~ ( "`" ~ !"``" ) ~ "_" } +reference_text = { !"<" ~ ( !("`"|"<") ~ ANY )+ } +reference_bracketed = { url | (target_name_qu ~ "_") | relative_reference } +relative_reference = { (!("`"|">") ~ ANY)+ } + +reference_explicit = { reference_label ~ "(" ~ " "* ~ reference_source ~ " "* ~ (NEWLINE ~ PEEK[..])? 
~ reference_title ~ " "* ~ ")" } +reference_label = { "[" ~ !"^" ~ (!"]" ~ inline)* ~ "]" } +reference_source          =  { reference_source_contents } +reference_source_contents = _{ ( (!("("|")"|">") ~ nonspacechar)+ | "(" ~ reference_source_contents ~ ")" )* } +reference_title        = { ( reference_title_single | reference_title_double | "" ) } +reference_title_single = { "'"  ~ ( !("'"  ~ " "+ ~ (")" | NEWLINE)) ~ ANY )* ~ "'" } +reference_title_double = { "\"" ~ ( !("\"" ~ " "+ ~ (")" | NEWLINE)) ~ ANY )* ~ "\"" } + +// Emails can't end with punctuation, but URLs must use a separate rule. +reference_auto = { url_auto | email } +//reference_embedded = { "`" ~ reference_embedded_source ~ "<" ~ absolute_url_with_fragment ~ ">`_" ~ "_"? } +//reference_embedded_source = { ( !("<"|":"|"`") ~ ( " " | nonspacechar | blank_line ) )* } + +substitution_ref = _{ "|" ~ substitution_name ~ "|" } + +/* URLs as defined by the WHATWG URL standard. */ +url = { absolute_url_no_query ~ ("?" ~ url_unit*)? ~ ("#" ~ url_unit*)? } +absolute_url_no_query = { +    ( special_url_scheme ~ ":" ~ scheme_relative_special_url ) | +    ( ^"file:" ~ scheme_relative_file_url ) | +    ( arbitrary_scheme ~ ":" ~ relative_url ) +} +scheme_relative_special_url = { "//" ~ host ~ (":" ~ url_port)? ~ path_absolute_url? } +path_absolute_url = { "/" ~ path_relative_url } +path_relative_url = { ( url_path_segment_unit* ~ "/" )* ~ url_path_segment_unit* } +url_path_segment_unit = { !("/"|"?") ~ url_unit } +url_port = { ASCII_DIGIT* } +scheme_relative_file_url = { "//" ~ ( host ~ !("/:/"|"/|/") )? ~ path_absolute_url } +relative_url = { ( "//" ~ host ~ (":" ~ url_port)? ~ path_absolute_url? ) | path_absolute_url | (!(arbitrary_scheme ~ ":") ~ path_relative_url) } +/* this is approximately a superset of valid hosts and opaque hosts */ +host = { ( !(":"|"/"|"?"|"#") ~ url_unit)+ | ("["~(ASCII_HEX_DIGIT|"."|":")+~"]") } +special_url_scheme = { ^"ftp" | (^"http" | ^"ws") ~ ^"s"? 
}  /* doesn't include "file" */ +arbitrary_scheme = { ASCII_ALPHA ~ ASCII_ALPHANUMERIC* } +url_unit = { +	ASCII_ALPHANUMERIC | +	"!"|"$"|"&"|"'"|"("|")"|"*"|"+"|","|"-"|"."|"/"|":"|";"|"="|"?"|"@"|"_"|"~" | +	(!(SURROGATE|NONCHARACTER_CODE_POINT) ~ '\u{A0}'..'\u{10FFFD}') | +	("%" ~ ASCII_HEX_DIGIT{2}) +} + +/* + * Rules for URLs that don't end in punctuation. + * This is a modification of the rules above to incorporate the docutils rules + * for the final character in an auto URL and for the character after it. + * The patterns used here to emulate the behavior of docutils' regex are taken + * from <http://www.inf.puc-rio.br/~roberto/docs/ry10-01.pdf>. + */ +url_auto = { +    ( absolute_url_no_query ~ ("?" ~ url_unit*)? ~ "#" ~ url_units_auto ) | +    ( absolute_url_no_query ~ "?" ~ url_units_auto ) | +    ( special_url_scheme ~ "://" ~ host ~ (":" ~ url_port)? ~ path_absolute_url_auto ) | +    ( special_url_scheme ~ "://" ~ host ~ ":" ~ url_port ~ &follows_auto_url ) | +    ( special_url_scheme ~ "://" ~ ( domain_host_auto | "["~(ASCII_HEX_DIGIT|"."|":")+~"]" ~ &follows_auto_url ) ) | +    ( ^"file://" ~ ( host ~ !("/:/"|"/|/") )? ~ path_absolute_url_auto ) | +    ( arbitrary_scheme ~ ":" ~ relative_url_auto ) +} +domain_host_auto = { +    ( !(":"|"/"|"?"|"#") ~ url_unit ~ url_units_auto ) | +    ( !(":"|"/"|"?"|"#") ~ url_unit ~ &">" ) | +    ( (ASCII_ALPHANUMERIC|"_"|"~"|"*"|"/"|"="|"+") ~ &follows_auto_url ) +} +path_absolute_url_auto = { "/" ~ path_relative_url_auto } +path_relative_url_auto = { prua1 | prua2 | &follows_auto_url } +prua1 = { ( url_path_segment_unit ~ prua1 ) | ( "/" ~ path_relative_url_auto ) } +prua2 = { ( url_path_segment_unit ~ prua2 ) | ( (ASCII_ALPHANUMERIC|"_"|"~"|"*"|"="|"+") ~ &follows_auto_url ) } +relative_url_auto = { +    ( "//" ~ host ~ (":" ~ url_port)? 
/* Rules for emails as defined by the HTML standard */
// Local part (one or more atext characters or dots), then "@", then one or
// more labels separated by dots.
email = { ( email_atext | "." )+ ~ "@" ~ email_label ~ ( "." ~ email_label )* }
// Characters accepted in the local part in addition to ".".
email_atext = { ASCII_ALPHANUMERIC|"!"|"#"|"$"|"%"|"&"|"'"|"/"|"="|"?"|"^"|"_"|"`"|"{"|"|"|"}"|"~" }
// A label starts with an alphanumeric and continues with at most 62 further
// alphanumerics or hyphens. The negative lookahead stops before any run of
// hyphens that is not followed by an alphanumeric, so a label never ends in "-".
email_label = { ASCII_ALPHANUMERIC ~ ( !("-"+ ~ !ASCII_ALPHANUMERIC) ~ (ASCII_ALPHANUMERIC|"-") ){0,62} }
~ ":: " ~ source ~ blank_line ~ +//     NEWLINE ~ verbatim_chunk+ +// } + +// doctest_block = { (doctest_line+ ~ (!(">" | blank_line) ~ line)*)+ } + +// block_quote_raw = { ":" ~ blank_line ~ NEWLINE ~ nonblank_indented_line+ } + +// block_quote_chunk = { +//     !"::" ~ ":" ~ blank_line ~ +//     NEWLINE ~ +//     blank_line* ~ +//     nonblank_indented_line+ +// } + +// block_quote = { block_quote_chunk+ } + +// nonblank_indented_line = { !blank_line ~ indented_line } + +// verbatim_chunk = { blank_line* ~ nonblank_indented_line+ } + +// verbatim = { verbatim_chunk+ } + +// horizontal_rule = { +//     ( "=" ~ sp ~ "=" ~ sp ~ "=" ~ (sp ~ "=")* +//     | "-" ~ sp ~ "-" ~ sp ~ "-" ~ (sp ~ "-")* +//     | "*" ~ sp ~ "*" ~ sp ~ "*" ~ (sp ~ "*")* +//     | "^" ~ sp ~ "^" ~ sp ~ "^" ~ (sp ~ "^")* +//     | "~" ~ sp ~ "~" ~ sp ~ "~" ~ (sp ~ "~")* +//     | "_" ~ sp ~ "_" ~ sp ~ "_" ~ (sp ~ "_")* +//     ) ~ +//     sp ~ NEWLINE ~ blank_line+ +// } + +// table = { grid_table | header_less_grid_table | simple_table } + +// simple_table = { "NotImplemented" ~ "simple_table" } + +// grid_table = { grid_table_header ~ grid_table_header_sep ~ grid_table_body+ } +// header_less_grid_table = { grid_table_sep ~ grid_table_body+ } +// grid_table_header = { sp ~ "+" ~ ( "-"+ ~ "+" )+ ~ blank_line ~ grid_table_row+ } +// grid_table_body = { ( grid_table_row ~ grid_table_sep )+ } +// grid_table_row = { sp ~ "|" ~ sp ~ ( table_cell ~ sp ~ "|" )+ ~ blank_line } +// table_cell = { ( ":" | ">" | "<" | "/" | "-" | spacechar | escaped_char | alphanumeric )+ } +// grid_table_header_sep = { sp ~ "+" ~ ( "="+ ~ "+" )+ ~ blank_line } +// grid_table_sep = { sp ~ "+" ~ ( "-"+ ~ "+" )+ ~ blank_line } + +// bullet = { !horizontal_rule ~ ("+" | "*" | "-") ~ spacechar+ } + +// bullet_list = { &bullet ~ (list_tight | list_loose) } + +// list_tight = { list_item_tight+ ~ blank_line* ~ !(bullet | enumerator | def_marker) } +// list_loose = { ( list_item ~ blank_line* )+ } + +// list_item = { (bullet | 
enumerator | def_marker) ~ list_block ~ list_continuation_block* } +// list_item_tight = { +//     (bullet | enumerator | def_marker) ~ +//     list_block ~ +//     (!blank_line ~ list_continuation_block)* ~ +//     !list_continuation_block +// } + +// list_block = { !blank_line ~ line ~ list_block_line* } + +// list_continuation_block = { blank_line* ~ ( indent ~ list_block )+ } + +// enumerator = { (ASCII_DIGIT+ | "#"+) ~ "." ~ spacechar+ } + +// ordered_list = { &enumerator ~ (list_tight | list_loose) } + +// list_block_line = { +//     !blank_line ~ +//     !( (indent? ~ (bullet | enumerator)) | def_marker ) ~ +//     !horizontal_rule ~ +//     optionally_indented_line +// } + + + +// space = _{ spacechar+ } + +// str = { normal_char+ ~ str_chunk* } +// str_chunk = _{ (normal_char | "_"+ ~ &alphanumeric)+ } + +// escaped_char = { "\\" ~ !NEWLINE ~ ("-" | "\\" | "`" | "|" | "*" | "_" | "{" | "}" | "[" | "]" | "(" | ")" | "#" | "+" | "." | "!" | ">" | "<") } + +// entity = { hex_entity | dec_entity | char_entity } + +// endline = _{ line_break | terminal_endline | normal_endline } +// normal_endline = _{ sp ~ NEWLINE ~ !(blank_line | ">" | line ~ ("="+ | "-"+) ~ NEWLINE) } +// terminal_endline = _{ sp ~ NEWLINE ~ EOI } +// line_break = _{ "  " ~ normal_endline } + +// symbol = { special_char } + +// application_depent = { !("`_" | "``_") ~ "`" ~ !"``" ~ target_name_qu ~ "`" ~ !("``" | "_") } + +// // This keeps the parser from getting bogged down on long strings of "*" or "_", +// // or strings of "*" or "_" with space on each side: +// ul_or_star_line = { ul_line | star_line } +// star_line = { "****" ~ "*"* | spacechar ~ "*"+ ~ &spacechar } +// ul_line = { "____" ~ "_"* | spacechar ~ "_"+ ~ &spacechar } + + +// empty_title = { "" } + +// ticks_2 = { "``" ~ !"`" } + +// code = { ticks_2 ~ ( (!"`" ~ nonspacechar)+ | "_" | !ticks_2 ~ "`" | !(sp ~ ticks_2) ~ (spacechar | NEWLINE ~ !blank_line) )+ ~ ticks_2 } + + +// quoted = { +//     "\"" ~ (!"\"" ~ ANY)* ~ "\"" | 
+//     "'"  ~ (!"'"  ~ ANY)* ~ "'" +// } +// spacechar = _{ " " | "\t" } +// sp = _{ spacechar* } +// spnl = _{ sp ~ (NEWLINE ~ sp)? } +// special_char = _{ "~" | "*" | "_" | "`" | "&" | "[" | "]" | "(" | ")" | "<" | "!" | "#" | "\\" | "\"" | "'" | extended_special_char } +// normal_char = _{ !( special_char | spacechar | NEWLINE ) ~ ANY } +// alphanumeric = { +//     ASCII_ALPHANUMERIC | +//     "\u{200}" | "\u{201}" | "\u{202}" | "\u{203}" | "\u{204}" | "\u{205}" | "\u{206}" | "\u{207}" | +//     "\u{210}" | "\u{211}" | "\u{212}" | "\u{213}" | "\u{214}" | "\u{215}" | "\u{216}" | "\u{217}" | +//     "\u{220}" | "\u{221}" | "\u{222}" | "\u{223}" | "\u{224}" | "\u{225}" | "\u{226}" | "\u{227}" | +//     "\u{230}" | "\u{231}" | "\u{232}" | "\u{233}" | "\u{234}" | "\u{235}" | "\u{236}" | "\u{237}" | +//     "\u{240}" | "\u{241}" | "\u{242}" | "\u{243}" | "\u{244}" | "\u{245}" | "\u{246}" | "\u{247}" | +//     "\u{250}" | "\u{251}" | "\u{252}" | "\u{253}" | "\u{254}" | "\u{255}" | "\u{256}" | "\u{257}" | +//     "\u{260}" | "\u{261}" | "\u{262}" | "\u{263}" | "\u{264}" | "\u{265}" | "\u{266}" | "\u{267}" | +//     "\u{270}" | "\u{271}" | "\u{272}" | "\u{273}" | "\u{274}" | "\u{275}" | "\u{276}" | "\u{277}" | +//     "\u{300}" | "\u{301}" | "\u{302}" | "\u{303}" | "\u{304}" | "\u{305}" | "\u{306}" | "\u{307}" | +//     "\u{310}" | "\u{311}" | "\u{312}" | "\u{313}" | "\u{314}" | "\u{315}" | "\u{316}" | "\u{317}" | +//     "\u{320}" | "\u{321}" | "\u{322}" | "\u{323}" | "\u{324}" | "\u{325}" | "\u{326}" | "\u{327}" | +//     "\u{330}" | "\u{331}" | "\u{332}" | "\u{333}" | "\u{334}" | "\u{335}" | "\u{336}" | "\u{337}" | +//     "\u{340}" | "\u{341}" | "\u{342}" | "\u{343}" | "\u{344}" | "\u{345}" | "\u{346}" | "\u{347}" | +//     "\u{350}" | "\u{351}" | "\u{352}" | "\u{353}" | "\u{354}" | "\u{355}" | "\u{356}" | "\u{357}" | +//     "\u{360}" | "\u{361}" | "\u{362}" | "\u{363}" | "\u{364}" | "\u{365}" | "\u{366}" | "\u{367}" | +//     "\u{370}" | "\u{371}" | "\u{372}" | 
"\u{373}" | "\u{374}" | "\u{375}" | "\u{376}" | "\u{377}" +// } + +// hex_entity = { "&#" ~ ("X"|"x") ~ ('0'..'9' | 'a'..'f' | 'A'..'F')+ ~ ";" } +// dec_entity = { "&#" ~ ASCII_DIGIT+ ~ ";" } +// char_entity = { "&" ~ ASCII_ALPHANUMERIC+ ~ ";" } + +// indent = _{ "\t" | "   " } +// indented_line = { indent ~ line } +// optionally_indented_line = { indent? ~ line } + +// doctest_line = { ">>> " ~ raw_line } + +// line = _{ raw_line } + +// raw_line = _{ (!NEWLINE ~ ANY)* ~ NEWLINE | (!EOI ~ ANY)+ ~ EOI } + +// // Syntax extensions + +// extended_special_char = { +//     //&{ extension(EXT_SMART) } ~ +//     ("." | "-" | "\"" | "'") | +//     //&{ extension(EXT_NOTES) } ~ +//     "^" +// } + +// smart = { +//     //&{ extension(EXT_SMART) } ~ +//     ( ellipsis | dash | single_quoted | double_quoted | apostrophe ) +// } + +// apostrophe = { "'" } + +// ellipsis = { "..." | ". . ." } + +// dash = { em_dash | en_dash } +// en_dash = { "-" ~ &ASCII_DIGIT } +// em_dash = { "---" | "--" } + +// single_quote_start = { "'" ~ !(spacechar | NEWLINE) } +// single_quote_end = { "'" ~ !alphanumeric } +// single_quoted = { single_quote_start ~ ( !single_quote_end ~ inline )+ ~ single_quote_end } + +// double_quote_start = { "\"" } +// double_quote_end = { "\"" } +// double_quoted = { double_quote_start ~ ( !double_quote_end ~ inline )+ ~ double_quote_end } + +// footnote = { "[#" ~ (!"]" ~ inline)+ ~ "]_" } + +// definition = { +//     &( (!defmark ~ nonspacechar ~ raw_line) ~ blank_line? 
~ defmark) ~ +//     d_list_title+ ~ +//     (def_tight | def_loose) +// } +// d_list_title = { !defmark ~ &nonspacechar ~ (!endline ~ inline)+ ~ sp ~ NEWLINE } +// def_tight = { &defmark ~ list_tight } +// def_loose = { blank_line ~ &defmark ~ list_loose } +// defmark = { (":" | "~") ~ spacechar+ } +// def_marker = { +//     //&{ extension(EXT_DLISTS) } ~ +//     defmark +// } diff --git a/parser/src/simplify.rs b/parser/src/simplify.rs new file mode 100644 index 0000000..7974991 --- /dev/null +++ b/parser/src/simplify.rs @@ -0,0 +1,662 @@ +/* +http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets + +Links can have internal or external targets. +In the source, targets look like: + +	.. targetname1: +	.. targetname2: + +	some paragraph or list item or so + +or: + +    .. targetname1: +	.. targetname2: https://link + +There’s also anonymous links and targets without names. + +TODO: continue documenting how it’s done via https://repo.or.cz/docutils.git/blob/HEAD:/docutils/docutils/transforms/references.py +*/ + +use std::collections::HashMap; + +use document_tree::{ +	url::Url, +	Document, +	HasChildren, +	attribute_types::NameToken, +	elements::{self as e, Element}, +	element_categories as c, +	extra_attributes::ExtraAttributes, +}; + + +#[derive(Debug)] +enum NamedTargetType { +	NumberedFootnote(usize), +	LabeledFootnote(usize), +	Citation, +	InternalLink, +	ExternalLink(Url), +	IndirectLink(NameToken), +	SectionTitle, +} +impl NamedTargetType { +	fn is_implicit_target(&self) -> bool { +		match self { +			NamedTargetType::SectionTitle => true, +			_ => false, +		} +	} +} + +#[derive(Clone, Debug)] +struct Substitution { +	content: Vec<c::TextOrInlineElement>, +	/// If true and the sibling before the reference is a text node, +	/// the text node gets right-trimmed.  +	ltrim: bool, +	/// Same as `ltrim` with the sibling after the reference. 
+	rtrim: bool, +} + +#[derive(Default, Debug)] +struct TargetsCollected { +	named_targets: HashMap<NameToken, NamedTargetType>, +	substitutions: HashMap<NameToken, Substitution>, +	normalized_substitutions: HashMap<String, Substitution>, +} +impl TargetsCollected { +	fn target_url<'t>(self: &'t TargetsCollected, refname: &[NameToken]) -> Option<&'t Url> { +		// TODO: Check if the target would expand circularly +		if refname.len() != 1 { +			panic!("Expected exactly one name in a reference."); +		} +		let name = refname[0].clone(); +		match self.named_targets.get(&name)? { +			NamedTargetType::ExternalLink(url) => Some(url), +			_ => unimplemented!(), +		} +	} +	 +	fn substitution<'t>(self: &'t TargetsCollected, refname: &[NameToken]) -> Option<&'t Substitution> { +		// TODO: Check if the substitution would expand circularly +		if refname.len() != 1 { +			panic!("Expected exactly one name in a substitution reference."); +		} +		let name = refname[0].clone(); +		self.substitutions.get(&name).or_else(|| { +			self.normalized_substitutions.get(&name.0.to_lowercase()) +		}) +	} +} + +trait ResolvableRefs { +	fn populate_targets(&self, refs: &mut TargetsCollected); +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> where Self: Sized; +} + +pub fn resolve_references(mut doc: Document) -> Document { +	let mut references: TargetsCollected = Default::default(); +	for c in doc.children() { +		c.populate_targets(&mut references); +	} +	let new: Vec<_> = doc.children_mut().drain(..).flat_map(|c| c.resolve_refs(&references)).collect(); +	Document::with_children(new) +} + +fn sub_pop<P, C>(parent: &P, refs: &mut TargetsCollected) where P: HasChildren<C>, C: ResolvableRefs { +	for c in parent.children() { +		c.populate_targets(refs); +	} +} + +fn sub_res<P, C>(mut parent: P, refs: &TargetsCollected) -> P where P: e::Element + HasChildren<C>, C: ResolvableRefs { +	let new: Vec<_> = parent.children_mut().drain(..).flat_map(|c| c.resolve_refs(refs)).collect(); +	
parent.children_mut().extend(new); +	parent +} + +fn sub_sub_pop<P, C1, C2>(parent: &P, refs: &mut TargetsCollected) where P: HasChildren<C1>, C1: HasChildren<C2>, C2: ResolvableRefs { +	for c in parent.children() { +		sub_pop(c, refs); +	} +} + +fn sub_sub_res<P, C1, C2>(mut parent: P, refs: &TargetsCollected) -> P where P: e::Element + HasChildren<C1>, C1: e::Element + HasChildren<C2>, C2: ResolvableRefs { +	let new: Vec<_> = parent.children_mut().drain(..).map(|c| sub_res(c, refs)).collect(); +	parent.children_mut().extend(new); +	parent +} + +impl ResolvableRefs for c::StructuralSubElement { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::StructuralSubElement::*; +		match self { +			Title(e)        => sub_pop(&**e, refs), +			Subtitle(e)     => sub_pop(&**e, refs), +			Decoration(e)   => sub_pop(&**e, refs), +			Docinfo(e)      => sub_pop(&**e, refs), +			SubStructure(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::StructuralSubElement::*; +		vec![match self { +			Title(e)        => sub_res(*e, refs).into(), +			Subtitle(e)     => sub_res(*e, refs).into(), +			Decoration(e)   => sub_res(*e, refs).into(), +			Docinfo(e)      => sub_res(*e, refs).into(), +			SubStructure(e) => return e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		}] +	} +} + +impl ResolvableRefs for c::SubStructure { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubStructure::*; +		match self { +			Topic(e) => sub_pop(&**e, refs), +			Sidebar(e) => sub_pop(&**e, refs), +			Transition(_) => {}, +			Section(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubStructure::*; +		vec![match self { +			Topic(e) => sub_res(*e, refs).into(), +			Sidebar(e) => sub_res(*e, refs).into(), +			Transition(e) => Transition(e), +			Section(e) => sub_res(*e, refs).into(), +			
BodyElement(e) => return e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		}] +	} +} + +impl ResolvableRefs for c::BodyElement { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::BodyElement::*; +		match self { +			Paragraph(e) => sub_pop(&**e, refs), +			LiteralBlock(e) => sub_pop(&**e, refs), +			DoctestBlock(e) => sub_pop(&**e, refs), +			MathBlock(_) => {}, +			Rubric(e) => sub_pop(&**e, refs), +			SubstitutionDefinition(e) => { +				let subst = Substitution { +					content: e.children().clone(), +					ltrim: e.extra().ltrim, +					rtrim: e.extra().rtrim +				}; +				for name in e.names() { +					if refs.substitutions.contains_key(name) { +						// TODO: Duplicate substitution name (level 3 system message). +					} +					// Intentionally overriding any previous values. +					refs.substitutions.insert(name.clone(), subst.clone()); +					refs.normalized_substitutions.insert(name.0.to_lowercase(), subst.clone()); +				} +			}, +			Comment(_) => {}, +			Pending(_) => { +				unimplemented!(); +			}, +			Target(e) => { +				if let Some(uri) = &e.extra().refuri { +					for name in e.names() { +						refs.named_targets.insert(name.clone(), NamedTargetType::ExternalLink(uri.clone())); +					} +				} +				// TODO: as is, people can only refer to the target directly containing the URL. +				// add refid and refnames to some HashMap and follow those later. 
+			}, +			Raw(_) => {}, +			Image(_) => {}, +			Compound(e) => sub_pop(&**e, refs), +			Container(e) => sub_pop(&**e, refs), +			BulletList(e) => sub_sub_pop(&**e, refs), +			EnumeratedList(e) => sub_sub_pop(&**e, refs), +			DefinitionList(e) => sub_sub_pop(&**e, refs), +			FieldList(e) => sub_sub_pop(&**e, refs), +			OptionList(e) => sub_sub_pop(&**e, refs), +			LineBlock(e) => sub_pop(&**e, refs), +			BlockQuote(e) => sub_pop(&**e, refs), +			Admonition(e) => sub_pop(&**e, refs), +			Attention(e) => sub_pop(&**e, refs), +			Hint(e) => sub_pop(&**e, refs), +			Note(e) => sub_pop(&**e, refs), +			Caution(e) => sub_pop(&**e, refs), +			Danger(e) => sub_pop(&**e, refs), +			Error(e) => sub_pop(&**e, refs), +			Important(e) => sub_pop(&**e, refs), +			Tip(e) => sub_pop(&**e, refs), +			Warning(e) => sub_pop(&**e, refs), +			Footnote(e) => sub_pop(&**e, refs), +			Citation(e) => sub_pop(&**e, refs), +			SystemMessage(e) => sub_pop(&**e, refs), +			Figure(e) => sub_pop(&**e, refs), +			Table(e) => sub_pop(&**e, refs) +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::BodyElement::*; +		vec![match self { +			Paragraph(e) => sub_res(*e, refs).into(), +			LiteralBlock(e) => sub_res(*e, refs).into(), +			DoctestBlock(e) => sub_res(*e, refs).into(), +			MathBlock(e) => MathBlock(e), +			Rubric(e) => sub_res(*e, refs).into(), +			SubstitutionDefinition(_) => return vec![], +			Comment(e) => Comment(e), +			Pending(e) => Pending(e), +			Target(e) => Target(e), +			Raw(e) => Raw(e), +			Image(e) => Image(e), +			Compound(e) => sub_res(*e, refs).into(), +			Container(e) => sub_res(*e, refs).into(), +			BulletList(e) => sub_sub_res(*e, refs).into(), +			EnumeratedList(e) => sub_sub_res(*e, refs).into(), +			DefinitionList(e) => sub_sub_res(*e, refs).into(), +			FieldList(e) => sub_sub_res(*e, refs).into(), +			OptionList(e) => sub_sub_res(*e, refs).into(), +			LineBlock(e) => sub_res(*e, refs).into(), +			BlockQuote(e) => sub_res(*e, refs).into(), 
+			Admonition(e) => sub_res(*e, refs).into(), +			Attention(e) => sub_res(*e, refs).into(), +			Hint(e) => sub_res(*e, refs).into(), +			Note(e) => sub_res(*e, refs).into(), +			Caution(e) => sub_res(*e, refs).into(), +			Danger(e) => sub_res(*e, refs).into(), +			Error(e) => sub_res(*e, refs).into(), +			Important(e) => sub_res(*e, refs).into(), +			Tip(e) => sub_res(*e, refs).into(), +			Warning(e) => sub_res(*e, refs).into(), +			Footnote(e) => sub_res(*e, refs).into(), +			Citation(e) => sub_res(*e, refs).into(), +			SystemMessage(e) => sub_res(*e, refs).into(), +			Figure(e) => sub_res(*e, refs).into(), +			Table(e) => sub_res(*e, refs).into() +		}] +	} +} + +impl ResolvableRefs for c::BibliographicElement { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::BibliographicElement::*; +		match self { +			Author(e) => sub_pop(&**e, refs), +			Authors(e) => sub_pop(&**e, refs), +			Organization(e) => sub_pop(&**e, refs), +			Address(e) => sub_pop(&**e, refs), +			Contact(e) => sub_pop(&**e, refs), +			Version(e) => sub_pop(&**e, refs), +			Revision(e) => sub_pop(&**e, refs), +			Status(e) => sub_pop(&**e, refs), +			Date(e) => sub_pop(&**e, refs), +			Copyright(e) => sub_pop(&**e, refs), +			Field(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::BibliographicElement::*; +		vec![match self { +			Author(e) => sub_res(*e, refs).into(), +			Authors(e) => sub_res(*e, refs).into(), +			Organization(e) => sub_res(*e, refs).into(), +			Address(e) => sub_res(*e, refs).into(), +			Contact(e) => sub_res(*e, refs).into(), +			Version(e) => sub_res(*e, refs).into(), +			Revision(e) => sub_res(*e, refs).into(), +			Status(e) => sub_res(*e, refs).into(), +			Date(e) => sub_res(*e, refs).into(), +			Copyright(e) => sub_res(*e, refs).into(), +			Field(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::TextOrInlineElement { +	fn populate_targets(&self, refs: &mut 
TargetsCollected) { +		use c::TextOrInlineElement::*; +		match self { +			String(_) => {}, +			Emphasis(e) => sub_pop(&**e, refs), +			Strong(e) => sub_pop(&**e, refs), +			Literal(e) => sub_pop(&**e, refs), +			Reference(e) => sub_pop(&**e, refs), +			FootnoteReference(e) => sub_pop(&**e, refs), +			CitationReference(e) => sub_pop(&**e, refs), +			SubstitutionReference(e) => sub_pop(&**e, refs), +			TitleReference(e) => sub_pop(&**e, refs), +			Abbreviation(e) => sub_pop(&**e, refs), +			Acronym(e) => sub_pop(&**e, refs), +			Superscript(e) => sub_pop(&**e, refs), +			Subscript(e) => sub_pop(&**e, refs), +			Inline(e) => sub_pop(&**e, refs), +			Problematic(e) => sub_pop(&**e, refs), +			Generated(e) => sub_pop(&**e, refs), +			Math(_) => {}, +			TargetInline(_) => { +				unimplemented!(); +			}, +			RawInline(_) => {}, +			ImageInline(_) => {} +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::TextOrInlineElement::*; +		vec![match self { +			String(e) => String(e), +			Emphasis(e) => sub_res(*e, refs).into(), +			Strong(e) => sub_res(*e, refs).into(), +			Literal(e) => sub_res(*e, refs).into(), +			Reference(mut e) => { +				if e.extra().refuri.is_none() { +					if let Some(uri) = refs.target_url(&e.extra().refname) { +						e.extra_mut().refuri = Some(uri.clone()); +					} +				} +				(*e).into() +			}, +			FootnoteReference(e) => sub_res(*e, refs).into(), +			CitationReference(e) => sub_res(*e, refs).into(), +			SubstitutionReference(e) => match refs.substitution(&e.extra().refname) { +				Some(Substitution {content, ltrim, rtrim}) => { +					// (level 3 system message). +					// TODO: ltrim and rtrim. +					if *ltrim || *rtrim { +						dbg!(content, ltrim, rtrim); +					} +					return content.clone() +				}, +				None => { +					// Undefined substitution name (level 3 system message). +					// TODO: This replaces the reference by a Problematic node. 
+					// The corresponding SystemMessage node should go in a generated +					// section with class "system-messages" at the end of the document. +					use document_tree::Problematic; +					let mut replacement: Box<Problematic> = Box::new(Default::default()); +					replacement.children_mut().push( +						c::TextOrInlineElement::String(Box::new(format!("|{}|", e.extra().refname[0].0))) +					); +					// TODO: Create an ID for replacement for the system_message to reference. +					// TODO: replacement.refid pointing to the system_message. +					Problematic(replacement) +				} +			}, +			TitleReference(e) => sub_res(*e, refs).into(), +			Abbreviation(e) => sub_res(*e, refs).into(), +			Acronym(e) => sub_res(*e, refs).into(), +			Superscript(e) => sub_res(*e, refs).into(), +			Subscript(e) => sub_res(*e, refs).into(), +			Inline(e) => sub_res(*e, refs).into(), +			Problematic(e) => sub_res(*e, refs).into(), +			Generated(e) => sub_res(*e, refs).into(), +			Math(e) => Math(e), +			TargetInline(e) => TargetInline(e), +			RawInline(e) => RawInline(e), +			ImageInline(e) => ImageInline(e) +		}] +	} +} + +impl ResolvableRefs for c::AuthorInfo { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::AuthorInfo::*; +		match self { +			Author(e) => sub_pop(&**e, refs), +			Organization(e) => sub_pop(&**e, refs), +			Address(e) => sub_pop(&**e, refs), +			Contact(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::AuthorInfo::*; +		vec![match self { +			Author(e) => sub_res(*e, refs).into(), +			Organization(e) => sub_res(*e, refs).into(), +			Address(e) => sub_res(*e, refs).into(), +			Contact(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::DecorationElement { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::DecorationElement::*; +		match self { +			Header(e) => sub_pop(&**e, refs), +			Footer(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: 
&TargetsCollected) -> Vec<Self> { +		use c::DecorationElement::*; +		vec![match self { +			Header(e) => sub_res(*e, refs).into(), +			Footer(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubTopic { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubTopic::*; +		match self { +			Title(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubTopic::*; +		match self { +			Title(e) => vec![sub_res(*e, refs).into()], +			BodyElement(e) => e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		} +	} +} + +impl ResolvableRefs for c::SubSidebar { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubSidebar::*; +		match self { +			Topic(e) => sub_pop(&**e, refs), +			Title(e) => sub_pop(&**e, refs), +			Subtitle(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubSidebar::*; +		vec![match self { +			Topic(e) => sub_res(*e, refs).into(), +			Title(e) => sub_res(*e, refs).into(), +			Subtitle(e) => sub_res(*e, refs).into(), +			BodyElement(e) => return e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		}] +	} +} + +impl ResolvableRefs for c::SubDLItem { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubDLItem::*; +		match self { +			Term(e) => sub_pop(&**e, refs), +			Classifier(e) => sub_pop(&**e, refs), +			Definition(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubDLItem::*; +		vec![match self { +			Term(e) => sub_res(*e, refs).into(), +			Classifier(e) => sub_res(*e, refs).into(), +			Definition(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubField { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubField::*; +		match self { +			
FieldName(e) => sub_pop(&**e, refs), +			FieldBody(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubField::*; +		vec![match self { +			FieldName(e) => sub_res(*e, refs).into(), +			FieldBody(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubOptionListItem { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubOptionListItem::*; +		match self { +			OptionGroup(e) => sub_sub_pop(&**e, refs), +			Description(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubOptionListItem::*; +		vec![match self { +			OptionGroup(e) => sub_sub_res(*e, refs).into(), +			Description(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubOption { +	fn populate_targets(&self, _: &mut TargetsCollected) {} +	fn resolve_refs(self, _: &TargetsCollected) -> Vec<Self> { vec![self] } +} + +impl ResolvableRefs for c::SubLineBlock { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubLineBlock::*; +		match self { +			LineBlock(e) => sub_pop(&**e, refs), +			Line(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubLineBlock::*; +		vec![match self { +			LineBlock(e) => sub_res(*e, refs).into(), +			Line(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubBlockQuote { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubBlockQuote::*; +		match self { +			Attribution(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubBlockQuote::*; +		match self { +			Attribution(e) => vec![sub_res(*e, refs).into()], +			BodyElement(e) => e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		} +	} +} + +impl ResolvableRefs for c::SubFootnote { +	fn populate_targets(&self, refs: &mut 
TargetsCollected) { +		use c::SubFootnote::*; +		match self { +			Label(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubFootnote::*; +		match self { +			Label(e) => vec![sub_res(*e, refs).into()], +			BodyElement(e) => e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		} +	} +} + +impl ResolvableRefs for c::SubFigure { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubFigure::*; +		match self { +			Caption(e) => sub_pop(&**e, refs), +			Legend(e) => sub_pop(&**e, refs), +			BodyElement(e) => e.populate_targets(refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubFigure::*; +		vec![match self { +			Caption(e) => sub_res(*e, refs).into(), +			Legend(e) => sub_res(*e, refs).into(), +			BodyElement(e) => return e.resolve_refs(refs).drain(..).map(Into::into).collect(), +		}] +	} +} + +impl ResolvableRefs for c::SubTable { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubTable::*; +		match self { +			Title(e) => sub_pop(&**e, refs), +			TableGroup(e) => sub_pop(&**e, refs), +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubTable::*; +		vec![match self { +			Title(e) => sub_res(*e, refs).into(), +			TableGroup(e) => sub_res(*e, refs).into(), +		}] +	} +} + +impl ResolvableRefs for c::SubTableGroup { +	fn populate_targets(&self, refs: &mut TargetsCollected) { +		use c::SubTableGroup::*; +		match self { +			TableColspec(_) => { +				unimplemented!(); +			}, +			TableHead(e) => { +				for c in e.children() { +					sub_sub_pop(c, refs); +				} +			}, +			TableBody(e) => { +				for c in e.children() { +					sub_sub_pop(c, refs); +				} +			}, +		} +	} +	fn resolve_refs(self, refs: &TargetsCollected) -> Vec<Self> { +		use c::SubTableGroup::*; +		vec![match self { +			TableColspec(e) => TableColspec(e), +			TableHead(mut e) => { +				let 
new: Vec<_> = e.children_mut().drain(..).map(|c| sub_sub_res(c, refs)).collect(); +				e.children_mut().extend(new); +				TableHead(e) +			}, +			TableBody(mut e) => { +				let new: Vec<_> = e.children_mut().drain(..).map(|c| sub_sub_res(c, refs)).collect(); +				e.children_mut().extend(new); +				TableBody(e) +			}, +		}] +	} +} diff --git a/parser/src/tests.rs b/parser/src/tests.rs new file mode 100644 index 0000000..1ef965a --- /dev/null +++ b/parser/src/tests.rs @@ -0,0 +1,242 @@ +use pest::consumes_to; +use pest::parses_to; + +use crate::pest_rst::{RstParser, Rule}; + +#[test] +fn plain() { +	parses_to! { +		parser: RstParser, +		input: "line\n", +		rule: Rule::paragraph, +		tokens: [ +			paragraph(0, 4, [ +				str(0, 4) +			]) +		] +	}; +} + +#[test] +fn emph_only() { +	parses_to! { +		parser: RstParser, +		input: "*emphasis*", +		rule: Rule::emph_outer, +		tokens: [ +			emph(1, 9, [str_nested(1, 9)]) +		] +	}; +} + +#[test] +fn emph() { +	parses_to! { +		parser: RstParser, +		input: "line *with markup*\n", +		rule: Rule::paragraph, +		tokens: [ +			paragraph(0, 18, [ +				str(0, 5), +				emph(6, 17, [str_nested(6, 17)]), +			]) +		] +	}; +} + +#[test] +fn title() { +	parses_to! { +		parser: RstParser, +		input: "\ +Title +===== +", +		rule: Rule::title, +		tokens: [ +			title(0, 12, [ title_single(0, 12, [ +				line(0, 6, [ str(0, 5) ]), +				adornments(6, 11), +			]) ]) +		] +	}; +} + +#[test] +fn title_overline() { +	parses_to! { +		parser: RstParser, +		input: "\ +----- +Title +----- +", +		rule: Rule::title, +		tokens: [ +			title(0, 17, [ title_double(0, 17, [ +				adornments(0, 5), +				line(6, 12, [ str(6, 11) ]), +			]) ]) +		] +	}; +} + +#[allow(clippy::cognitive_complexity)] +#[test] +fn two_targets() { +	parses_to! { +		parser: RstParser, +		input: "\ +.. _a: http://example.com +.. 
// Parses a paragraph containing an inline substitution reference, followed by
// two `replace::` substitution definitions, the second of which continues on
// an indented line. The number pairs in `tokens` are the start and end
// positions of each token within `input`.
#[allow(clippy::cognitive_complexity)]
#[test]
fn substitutions() {
	parses_to! {
		parser: RstParser,
		input: "\
A |subst| in-line

.. |subst| replace:: substitution
.. |subst2| replace:: it can also
   be hanging
",
		rule: Rule::document,
		tokens: [
			// "A |subst| in-line": text, the reference name, text.
			paragraph(0, 17, [
				str(0, 2),
				substitution_name(3, 8),
				str(9, 17),
			]),
			// Single-line definition of `subst`.
			substitution_def(19, 52, [
				substitution_name(23, 28),
				replace(30, 52, [ paragraph(40, 52, [str(40, 52)]) ]),
			]),
			// Definition of `subst2`; its paragraph spans the line break.
			substitution_def(53, 101, [
				substitution_name(57, 63),
				replace(65, 101, [ paragraph(75, 101, [
					str(75, 86), ws_newline(86, 87),
					str(88, 100),
				]) ]),
			]),
		]
	};
}
//http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#bullet-lists

/// The marker character of a bullet list.
///
/// In declaration order: `*`, `+`, `-`, `•`, `‣`, `⁃`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BulletListType { Ast, Plus, Minus, Bullet, TriBullet, HyphenBullet }

/// The numbering style of an enumerated list.
///
/// The first five variants correspond, in order, to `1`, `A`, `a`, `I`, `i`.
// NOTE(review): `Auto` is presumably the `#` auto-enumerator; confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EnumListChar { Arabic, AlphaUpper, AlphaLower, RomanUpper, RomanLower, Auto }

/// The punctuation around an enumerator.
///
/// In declaration order: `1.`, `(1)`, `1)`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EnumListType { Period, ParenEnclosed, Paren }
/// A punctuation character usable as an adornment.
///
/// The variants follow the order of this list:
/// ``! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~``
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AdornmentChar {
	Bang, DQuote, Hash, Dollar, Percent, Amp, SQuote, LParen, RParen, Ast, Plus, Comma,
	Minus, Period, Slash, Colon, Semicolon, Less, Eq, More, Question, At, LBrack,
	Backslash, RBrack, Caret, Underscore, Backtick, LBrace, Pipe, RBrace, Tilde,
}

/// The kind of a footnote label.
///
/// In declaration order: `[1]`, `[#]`, `[*]`, `[#foo]`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FootnoteType { Numbered(usize), AutoNumber, AutoSymbol, AutoNamed(String) }
