diff options
| author | Andreu Botella | 2019-11-07 00:32:46 +0100 | 
|---|---|---|
| committer | Philipp A | 2019-11-07 09:27:38 +0100 | 
| commit | 5387291c1a2d4cfd0e5acdad26dcc7e33329d39a (patch) | |
| tree | 76df86d6b05af4d9dc2d7e036c1011b848f15f48 | |
| parent | 10cc972f2e4b99e6d3082970ee6982bfee5211c0 (diff) | |
| download | rust-rst-5387291c1a2d4cfd0e5acdad26dcc7e33329d39a.tar.bz2 | |
Updating the parser to recognize most hyperlink references.
| -rw-r--r-- | src/parser/conversion/inline.rs | 106 | ||||
| -rw-r--r-- | src/rst.pest | 88 | 
2 files changed, 175 insertions, 19 deletions
| diff --git a/src/parser/conversion/inline.rs b/src/parser/conversion/inline.rs index 50a6258..c51b2d9 100644 --- a/src/parser/conversion/inline.rs +++ b/src/parser/conversion/inline.rs @@ -1,5 +1,6 @@  use failure::Error;  use pest::iterators::Pair; +use url::Url;  use crate::document_tree::{  	ExtraAttributes, @@ -20,31 +21,116 @@ use super::whitespace_normalize_name;  pub fn convert_inline(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> {  	Ok(match pair.as_rule() {  		Rule::str       		=> pair.as_str().into(), -		Rule::reference 		=> convert_reference(pair)?.into(), +		Rule::reference 		=> convert_reference(pair)?,  		Rule::substitution_ref 	=> convert_substitution(pair)?.into(),  		rule => unimplemented!("unknown rule {:?}", rule),  	})  } -fn convert_reference(pair: Pair<Rule>) -> Result<e::Reference, Error> { +fn convert_reference(pair: Pair<Rule>) -> Result<c::TextOrInlineElement, Error> {  	let name; -	let refuri = None; +	let refuri;  	let refid; -	let refname = vec![]; +	let mut refname = vec![]; +	let mut children: Vec<c::TextOrInlineElement> = vec![];  	let concrete = pair.into_inner().next().unwrap();  	match concrete.as_rule() {  		Rule::reference_target => {  			let rt_inner = concrete.into_inner().next().unwrap(); // reference_target_uq or target_name_qu -			refid = Some(rt_inner.as_str().into()); -			name  = Some(rt_inner.as_str().into()); +			match rt_inner.as_rule() { +				Rule::reference_target_uq => { +					refid  = None; +					name   = Some(rt_inner.as_str().into()); +					refuri = None; +					refname.push(rt_inner.as_str().into()); +					children.push(rt_inner.as_str().into()); +				}, +				Rule::reference_target_qu => { +					let (text, reference) = { +						let mut text = None; +						let mut reference = None; +						for inner in rt_inner.clone().into_inner() { +							match inner.as_rule() { +								Rule::reference_text => text = Some(inner), +								Rule::reference_bracketed => reference = Some(inner), +								_ => unreachable!() +							} +						} +						(text, reference) +					}; +					let trimmed_text = match (&text, &reference) { +						(Some(text), None) => text.as_str(), +						(_, Some(reference)) => { +							text +								.map(|text| text.as_str().trim_end_matches(|ch| " \n\r".contains(ch))) +								.filter(|text| !text.is_empty()) +								.unwrap_or_else(|| reference.clone().into_inner().next().unwrap().as_str()) +						} +						(None, None) => unreachable!() +					}; +					refid = None; +					name = Some(trimmed_text.into()); +					refuri = if let Some(reference) = reference { +						let inner = reference.into_inner().next().unwrap(); +						match inner.as_rule() { +							Rule::url => if let Ok(url) = Url::parse(inner.as_str()) { +								Some(url.into()) +							} else { +								unimplemented!("reference to a relative URL") +							}, +							Rule::target_name_qu => { +								refname.push(inner.as_str().into()); +								None +							}, +							Rule::relative_reference => unimplemented!("reference to a relative URL"), +							_ => unreachable!() +						} +					} else { +						refname.push(trimmed_text.into()); +						None +					}; +					children.push(trimmed_text.into()); +				}, +				_ => unreachable!() +			}  		},  		Rule::reference_explicit => unimplemented!("explicit reference"), -		Rule::reference_auto => unimplemented!("auto reference"), +		Rule::reference_auto => { +			let rt_inner = concrete.into_inner().next().unwrap(); +			match rt_inner.as_rule() { +				Rule::url_auto => match Url::parse(rt_inner.as_str()) { +					Ok(url) => { +						refuri = Some(url.into()); +						name   = None; +						refid  = None; +						children.push(rt_inner.as_str().into()); +					}, +					// if our parser got a URL wrong, return it as a string +					Err(_) => return Ok(rt_inner.as_str().into()) +				}, +				Rule::email => { +					let mailto_url = String::from("mailto:") + rt_inner.as_str(); +					match Url::parse(&mailto_url) { +						Ok(url) => { +							refuri = Some(url.into()); +							name   = None; +							refid  = None; +							children.push(rt_inner.as_str().into()); +						}, +						// if our parser got a URL wrong, return it as a string +						Err(_) => return Ok(rt_inner.as_str().into()) +					} +				}, +				_ => unreachable!() +			} +		},  		_ => unreachable!(),  	}; -	Ok(e::Reference::with_extra( -		a::Reference { name, refuri, refid, refname } -	)) +	Ok(e::Reference::new( +		Default::default(), +		a::Reference { name, refuri, refid, refname }, +		children +	).into())  }  fn convert_substitution(pair: Pair<Rule>) -> Result<e::SubstitutionReference, Error> { diff --git a/src/rst.pest b/src/rst.pest index b9c60e9..289b4f6 100644 --- a/src/rst.pest +++ b/src/rst.pest @@ -6,7 +6,7 @@  // and pest only has one stack that we need for indentation.  document = _{ SOI ~ blocks ~ EOI } -blocks   = _{ block ~ (blank_line* ~ block)* } +blocks   = _{ block ~ (blank_line* ~ block)* ~ blank_line? }  block    = _{ PEEK[..] ~ hanging_block }  // This is the list of all block-level elements @@ -48,7 +48,7 @@ target         =  { target_qu | target_uq }  target_uq      = _{ ".. _"  ~         target_name_uq ~           ":" ~ (" " ~ link_target)? ~ " "* ~ NEWLINE }  target_qu      = _{ ".. _`" ~ !"``" ~ target_name_qu ~ !"``:" ~ "`:" ~ (" " ~ link_target)? ~ " "* ~ NEWLINE }  target_name_uq =  { ( !("_"|":"|"`") ~ !NEWLINE ~ ANY )* } -target_name_qu =  { ( !(    ":"|"`") ~ !NEWLINE ~ ANY )* } +target_name_qu =  { ( !(":"|"`"|"_>") ~ ANY )* }  link_target    =  { nonspacechar+ }  // Title. A block type @@ -128,8 +128,11 @@ str = { (!(NEWLINE | reference | substitution_ref) ~ ANY)+ }  reference = { reference_target | reference_explicit | reference_auto }  reference_target = { reference_target_uq ~ "_" | reference_target_qu } -reference_target_uq =  { (!("_"|":"|"`") ~ nonspacechar)* } -reference_target_qu = _{ ( !("`"? ~ "`_") ~ "`" ~ !"``" ) ~ target_name_qu ~ ( "`" ~ !"``" ) ~ "_" } +reference_target_uq =  { (!("_"|":"|"`") ~ nonspacechar)+ } +reference_target_qu = { ( !("`"? ~ "`_") ~ "`" ~ !"``" ) ~ reference_text? ~ ("<" ~ reference_bracketed ~ ">")? ~ ( "`" ~ !"``" ) ~ "_" } +reference_text = { !"<" ~ ( !("`"|"<") ~ ANY )+ } +reference_bracketed = { url | (target_name_qu ~ "_") | relative_reference } +relative_reference = { (!("`"|">") ~ ANY)+ }  reference_explicit = { reference_label ~ "(" ~ " "* ~ reference_source ~ " "* ~ (NEWLINE ~ PEEK[..])? ~ reference_title ~ " "* ~ ")" }  reference_label = { "[" ~ !"^" ~ (!"]" ~ inline)* ~ "]" } @@ -139,14 +142,81 @@ reference_title        = { ( reference_title_single | reference_title_double | "  reference_title_single = { "'"  ~ ( !("'"  ~ " "+ ~ (")" | NEWLINE)) ~ ANY )* ~ "'" }  reference_title_double = { "\"" ~ ( !("\"" ~ " "+ ~ (")" | NEWLINE)) ~ ANY )* ~ "\"" } -reference_auto = { reference_embedded | reference_auto_url | reference_auto_email } -reference_embedded = { "`" ~ reference_embedded_source ~ "<" ~ ASCII_ALPHA+ ~ "://" ~ (!(NEWLINE|">") ~ ANY)+ ~ ">`_" ~ "_"? } -reference_embedded_source = { ( !("<"|":"|"`") ~ ( " " | nonspacechar | blank_line ) )* } -reference_auto_url = { ASCII_ALPHA+ ~ "://" ~ (!(NEWLINE|">") ~ ANY)+ } -reference_auto_email = { "<" ~ "mailto:"? ~ (ASCII_ALPHANUMERIC|"-"|"+"|"_"|"."|"/"|"!"|"%"|"~"|"$")+ ~ "@" ~ (!(NEWLINE | ">") ~ ANY)+ ~ ">" } +// Emails can't end with punctuation, but URLs must use a separate rule. +reference_auto = { url_auto | email } +//reference_embedded = { "`" ~ reference_embedded_source ~ "<" ~ absolute_url_with_fragment ~ ">`_" ~ "_"? } +//reference_embedded_source = { ( !("<"|":"|"`") ~ ( " " | nonspacechar | blank_line ) )* }  substitution_ref = { "|" ~ substitution_name ~ "|" } +/* URLs as defined by the WHATWG URL standard. */ +url = { absolute_url_no_query ~ ("?" ~ url_unit*)? ~ ("#" ~ url_unit*)? } +absolute_url_no_query = @{ +    ( special_url_scheme ~ ":" ~ scheme_relative_special_url ) | +    ( ^"file:" ~ scheme_relative_file_url ) | +    ( arbitrary_scheme ~ ":" ~ relative_url ) +} +scheme_relative_special_url = @{ "//" ~ host ~ (":" ~ url_port)? ~ path_absolute_url? } +path_absolute_url = @{ "/" ~ path_relative_url } +path_relative_url = @{ ( url_path_segment_unit* ~ "/" )* ~ url_path_segment_unit* } +url_path_segment_unit = @{ !("/"|"?") ~ url_unit } +url_port = @{ ASCII_DIGIT* } +scheme_relative_file_url = @{ "//" ~ ( host ~ !("/:/"|"/|/") )? ~ path_absolute_url } +relative_url = @{ ( "//" ~ host ~ (":" ~ url_port)? ~ path_absolute_url? ) | path_absolute_url | (!(arbitrary_scheme ~ ":") ~ path_relative_url) } +/* this is approximately a superset of valid hosts and opaque hosts */ +host = @{ ( !(":"|"/"|"?"|"#") ~ url_unit)+ | ("["~(ASCII_HEX_DIGIT|"."|":")+~"]") } +special_url_scheme = @{ ^"ftp" | (^"http" | ^"ws") ~ ^"s"? }  /* doesn't include "file" */ +arbitrary_scheme = @{ ASCII_ALPHA ~ ASCII_ALPHANUMERIC* } +url_unit = @{ ASCII_ALPHANUMERIC|"!"|"$"|"&"|"'"|"("|")"|"*"|"+"|","|"-"|"."|"/"|":"|";"|"="|"?"|"@"|"_"|"~"|(!(SURROGATE|NONCHARACTER_CODE_POINT) ~ '\u{A0}'..'\u{10FFFD}')|("%" ~ ASCII_HEX_DIGIT{2}) } + +/* + * Rules for URLs that don't end in punctuation. + * This is a modification of the rules above to incorporate the docutils rules + * for the final character in an auto URL and for the character after it. + * The patterns used here to emulate the behavior of docutils' regex are taken + * from <http://www.inf.puc-rio.br/~roberto/docs/ry10-01.pdf>. + */ +url_auto = { +    ( absolute_url_no_query ~ ("?" ~ url_unit*)? ~ "#" ~ url_units_auto ) | +    ( absolute_url_no_query ~ "?" ~ url_units_auto ) | +    ( special_url_scheme ~ "://" ~ host ~ (":" ~ url_port)? ~ path_absolute_url_auto ) | +    ( special_url_scheme ~ "://" ~ host ~ ":" ~ url_port ~ &follows_auto_url ) | +    ( special_url_scheme ~ "://" ~ ( domain_host_auto | "["~(ASCII_HEX_DIGIT|"."|":")+~"]" ~ &follows_auto_url ) ) | +    ( ^"file://" ~ ( host ~ !("/:/"|"/|/") )? ~ path_absolute_url_auto ) | +    ( arbitrary_scheme ~ ":" ~ relative_url_auto ) +} +domain_host_auto = @{ +    ( !(":"|"/"|"?"|"#") ~ url_unit ~ url_units_auto ) | +    ( !(":"|"/"|"?"|"#") ~ url_unit ~ &">" ) | +    ( (ASCII_ALPHANUMERIC|"_"|"~"|"*"|"/"|"="|"+") ~ &follows_auto_url ) +} +path_absolute_url_auto = @{ "/" ~ path_relative_url_auto } +path_relative_url_auto = @{ prua1 | prua2 | &follows_auto_url } +prua1 = @{ ( url_path_segment_unit ~ prua1 ) | ( "/" ~ path_relative_url_auto ) } +prua2 = @{ ( url_path_segment_unit ~ prua2 ) | ( (ASCII_ALPHANUMERIC|"_"|"~"|"*"|"="|"+") ~ &follows_auto_url ) } +relative_url_auto = @{ +    ( "//" ~ host ~ (":" ~ url_port)? ~ path_absolute_url_auto ) | +    ( "//" ~ host ~ ":" ~ url_port ~ &follows_auto_url ) | +    ( "//" ~ ( domain_host_auto | "["~(ASCII_HEX_DIGIT|"."|":")+~"]" ~ &follows_auto_url ) ) | +    path_absolute_url_auto | +    // (prua1|prua2) is path_relative_url_auto minus the &follows_auto_url case +    (!(arbitrary_scheme ~ ":") ~ (prua1 | prua2)) +} +url_units_auto = @{ +    ( url_unit ~ url_units_auto ) | +    ( url_unit ~ &">" ~ &follows_auto_url ) | +    ( (ASCII_ALPHANUMERIC|"_"|"~"|"*"|"/"|"="|"+") ~ &follows_auto_url ) +} +follows_auto_url = @{ +    EOI|"\x00"|WHITE_SPACE|">"|"\u{201A}"|"\u{201E}"| +    (!(CONNECTOR_PUNCTUATION|OPEN_PUNCTUATION|"#"|"%"|"&"|"*"|"@") ~ PUNCTUATION) +} + +/* Rules for emails as defined by the HTML standard */ +email = { ( email_atext | "." )+ ~ "@" ~ email_label ~ ( "." ~ email_label )* } +email_atext = @{ ASCII_ALPHANUMERIC|"!"|"#"|"$"|"%"|"&"|"'"|"/"|"="|"?"|"^"|"_"|"`"|"{"|"|"|"}"|"~" } +email_label = @{ ASCII_ALPHANUMERIC ~ ( !("-"+ ~ !ASCII_ALPHANUMERIC) ~ (ASCII_ALPHANUMERIC|"-") ){0,62} } +  /*   * character classes   */ | 
