aboutsummaryrefslogtreecommitdiffstats
path: root/src/sanitizer.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/sanitizer.js')
-rw-r--r--src/sanitizer.js290
1 files changed, 290 insertions, 0 deletions
diff --git a/src/sanitizer.js b/src/sanitizer.js
new file mode 100644
index 00000000..0a0b2907
--- /dev/null
+++ b/src/sanitizer.js
@@ -0,0 +1,290 @@
+/*
+ * HTML Parser By Misko Hevery (misko@hevery.com)
+ * based on: HTML Parser By John Resig (ejohn.org)
+ * Original code by Erik Arvidsson, Mozilla Public License
+ * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
+ *
+ * // Use like so:
+ * htmlParser(htmlString, {
+ * start: function(tag, attrs, unary) {},
+ * end: function(tag) {},
+ * chars: function(text) {},
+ * comment: function(text) {}
+ * });
+ *
+ */
+
+// Regular Expressions for parsing tags and attributes
+var START_TAG_REGEXP = /^<\s*([\w:]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,
+ END_TAG_REGEXP = /^<\s*\/\s*([\w:]+)[^>]*>/,
+ ATTR_REGEXP = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g,
+ BEGIN_TAG_REGEXP = /^</,
+ BEGING_END_TAGE_REGEXP = /^<\s*\//,
+ COMMENT_REGEXP = /<!--(.*?)-->/g,
+ CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g;
+
+// Empty Elements - HTML 4.01
+var emptyElements = makeMap("area,base,basefont,br,col,hr,img,input,isindex,link,param");
+
+// Block Elements - HTML 4.01
+var blockElements = makeMap("address,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,"+
+ "form,hr,ins,isindex,li,map,menu,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
+
+// Inline Elements - HTML 4.01
+var inlineElements = makeMap("a,abbr,acronym,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,img,"+
+ "input,ins,kbd,label,map,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
+
+// Elements that you can, intentionally, leave open
+// (and which close themselves)
+var closeSelfElements = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
+
+// Attributes that have their values filled in disabled="disabled"
+var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
+
+// Special Elements (can contain anything)
+var specialElements = makeMap("script,style");
+
+var validElements = extend({}, emptyElements, blockElements, inlineElements, closeSelfElements);
+var validAttrs = extend({}, fillAttrs, makeMap(
+ 'abbr,align,alink,alt,archive,axis,background,bgcolor,border,cellpadding,cellspacing,cite,class,classid,clear,code,codebase,'+
+ 'codetype,color,cols,colspan,content,coords,data,dir,face,for,headers,height,href,hreflang,hspace,id,label,lang,language,'+
+ 'link,longdesc,marginheight,marginwidth,maxlength,media,method,name,nowrap,profile,prompt,rel,rev,rows,rowspan,rules,scheme,'+
+ 'scope,scrolling,shape,size,span,src,standby,start,summary,tabindex,target,text,title,type,usemap,valign,value,valuetype,'+
+ 'vlink,vspace,width'));
+
+/**
+ * @example
+ * htmlParser(htmlString, {
+ * start: function(tag, attrs, unary) {},
+ * end: function(tag) {},
+ * chars: function(text) {},
+ * comment: function(text) {}
+ * });
+ *
+ * @param {string} html string
+ * @param {object} handler
+ */
+var htmlParser = function( html, handler ) {
+ var index, chars, match, stack = [], last = html;
+ stack.last = function(){ return stack[ stack.length - 1 ]; };
+
+ while ( html ) {
+ chars = true;
+
+ // Make sure we're not in a script or style element
+ if ( !stack.last() || !specialElements[ stack.last() ] ) {
+
+ // Comment
+ if ( html.indexOf("<!--") === 0 ) {
+ index = html.indexOf("-->");
+
+ if ( index >= 0 ) {
+ if ( handler.comment )
+ handler.comment( html.substring( 4, index ) );
+ html = html.substring( index + 3 );
+ chars = false;
+ }
+
+ // end tag
+ } else if ( BEGING_END_TAGE_REGEXP.test(html) ) {
+ match = html.match( END_TAG_REGEXP );
+
+ if ( match ) {
+ html = html.substring( match[0].length );
+ match[0].replace( END_TAG_REGEXP, parseEndTag );
+ chars = false;
+ }
+
+ // start tag
+ } else if ( BEGIN_TAG_REGEXP.test(html) ) {
+ match = html.match( START_TAG_REGEXP );
+
+ if ( match ) {
+ html = html.substring( match[0].length );
+ match[0].replace( START_TAG_REGEXP, parseStartTag );
+ chars = false;
+ }
+ }
+
+ if ( chars ) {
+ index = html.indexOf("<");
+
+ var text = index < 0 ? html : html.substring( 0, index );
+ html = index < 0 ? "" : html.substring( index );
+
+ if ( handler.chars )
+ handler.chars( text );
+ }
+
+ } else {
+ html = html.replace(new RegExp("(.*)<\\s*\\/\\s*" + stack.last() + "[^>]*>", 'i'), function(all, text){
+ text = text.
+ replace(COMMENT_REGEXP, "$1").
+ replace(CDATA_REGEXP, "$1");
+
+ if ( handler.chars )
+ handler.chars( text );
+
+ return "";
+ });
+
+ parseEndTag( "", stack.last() );
+ }
+
+ if ( html == last ) {
+ throw "Parse Error: " + html;
+ }
+ last = html;
+ }
+
+ // Clean up any remaining tags
+ parseEndTag();
+
+ function parseStartTag( tag, tagName, rest, unary ) {
+ tagName = lowercase(tagName);
+ if ( blockElements[ tagName ] ) {
+ while ( stack.last() && inlineElements[ stack.last() ] ) {
+ parseEndTag( "", stack.last() );
+ }
+ }
+
+ if ( closeSelfElements[ tagName ] && stack.last() == tagName ) {
+ parseEndTag( "", tagName );
+ }
+
+ unary = emptyElements[ tagName ] || !!unary;
+
+ if ( !unary )
+ stack.push( tagName );
+
+ if ( handler.start ) {
+ var attrs = {};
+
+ rest.replace(ATTR_REGEXP, function(match, name) {
+ var value = arguments[2] ? arguments[2] :
+ arguments[3] ? arguments[3] :
+ arguments[4] ? arguments[4] :
+ fillAttrs[name] ? name : "";
+
+ attrs[name] = value; //value.replace(/(^|[^\\])"/g, '$1\\\"') //"
+ });
+
+ if ( handler.start )
+ handler.start( tagName, attrs, unary );
+ }
+ }
+
+ function parseEndTag( tag, tagName ) {
+ var pos = 0, i;
+ tagName = lowercase(tagName);
+ if ( tagName )
+ // Find the closest opened tag of the same type
+ for ( pos = stack.length - 1; pos >= 0; pos-- )
+ if ( stack[ pos ] == tagName )
+ break;
+
+ if ( pos >= 0 ) {
+ // Close all the open elements, up the stack
+ for ( i = stack.length - 1; i >= pos; i-- )
+ if ( handler.end )
+ handler.end( stack[ i ] );
+
+ // Remove the open elements from the stack
+ stack.length = pos;
+ }
+ }
+};
+
+/**
+ * @param str 'key1,key2,...'
+ * @returns {key1:true, key2:true, ...}
+ */
+function makeMap(str){
+ var obj = {}, items = str.split(","), i;
+ for ( i = 0; i < items.length; i++ )
+ obj[ items[i] ] = true;
+ return obj;
+}
+
+/*
+ * For attack vectors see: http://ha.ckers.org/xss.html
+ */
+var JAVASCRIPT_URL = /^javascript:/i,
+ NBSP_REGEXP = /&nbsp;/gim,
+ HEX_ENTITY_REGEXP = /&#x([\da-f]*);?/igm,
+ DEC_ENTITY_REGEXP = /&#(\d+);?/igm,
+ CHAR_REGEXP = /[\w:]/gm,
+ HEX_DECODE = function(match, code){return fromCharCode(parseInt(code,16));},
+ DEC_DECODE = function(match, code){return fromCharCode(code);};
+/**
+ * @param {string} url
+ * @returns true if url decodes to something which starts with 'javascript:' hence unsafe
+ */
+function isJavaScriptUrl(url) {
+ var chars = [];
+ url.replace(NBSP_REGEXP, '').
+ replace(HEX_ENTITY_REGEXP, HEX_DECODE).
+ replace(DEC_ENTITY_REGEXP, DEC_DECODE).
+ // Remove all non \w: characters, unfurtunetly value.replace(/[\w:]/,'') can be defeated using \u0000
+ replace(CHAR_REGEXP, function(ch){chars.push(ch);});
+ return JAVASCRIPT_URL.test(lowercase(chars.join('')));
+}
+
+/**
+ * create an HTML/XML writer which writes to buffer
+ * @param {Array} buf use buf.jain('') to get out sanitized html string
+ * @returns {
+ * start: function(tag, attrs, unary) {},
+ * end: function(tag) {},
+ * chars: function(text) {},
+ * comment: function(text) {}
+ * }
+ */
+function htmlSanitizeWriter(buf){
+ var ignore = false;
+ var out = bind(buf, buf.push);
+ return {
+ start: function(tag, attrs, unary){
+ tag = lowercase(tag);
+ if (!ignore && specialElements[tag]) {
+ ignore = tag;
+ }
+ if (!ignore && validElements[tag]) {
+ out('<');
+ out(tag);
+ foreach(attrs, function(value, key){
+ if (validAttrs[lowercase(key)] && !isJavaScriptUrl(value)) {
+ out(' ');
+ out(key);
+ out('="');
+ out(value.
+ replace(/</g, '&lt;').
+ replace(/>/g, '&gt;').
+ replace(/\"/g,'&quot;'));
+ out('"');
+ }
+ });
+ out(unary ? '/>' : '>');
+ }
+ },
+ end: function(tag){
+ tag = lowercase(tag);
+ if (!ignore && validElements[tag]) {
+ out('</');
+ out(tag);
+ out('>');
+ }
+ if (tag == ignore) {
+ ignore = false;
+ }
+ },
+ chars: function(chars){
+ if (!ignore) {
+ out(chars.
+ replace(/&(\w+[&;\W])?/g, function(match, entity){return entity?match:'&amp;';}).
+ replace(/</g, '&lt;').
+ replace(/>/g, '&gt;'));
+ }
+ }
+ };
+}