diff options
Diffstat (limited to 'src/sanitizer.js')
| -rw-r--r-- | src/sanitizer.js | 290 |
1 files changed, 290 insertions, 0 deletions
diff --git a/src/sanitizer.js b/src/sanitizer.js new file mode 100644 index 00000000..0a0b2907 --- /dev/null +++ b/src/sanitizer.js @@ -0,0 +1,290 @@ +/* + * HTML Parser By Misko Hevery (misko@hevery.com) + * based on: HTML Parser By John Resig (ejohn.org) + * Original code by Erik Arvidsson, Mozilla Public License + * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js + * + * // Use like so: + * htmlParser(htmlString, { + * start: function(tag, attrs, unary) {}, + * end: function(tag) {}, + * chars: function(text) {}, + * comment: function(text) {} + * }); + * + */ + +// Regular Expressions for parsing tags and attributes +var START_TAG_REGEXP = /^<\s*([\w:]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/, + END_TAG_REGEXP = /^<\s*\/\s*([\w:]+)[^>]*>/, + ATTR_REGEXP = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g, + BEGIN_TAG_REGEXP = /^</, + BEGING_END_TAGE_REGEXP = /^<\s*\//, + COMMENT_REGEXP = /<!--(.*?)-->/g, + CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g; + +// Empty Elements - HTML 4.01 +var emptyElements = makeMap("area,base,basefont,br,col,hr,img,input,isindex,link,param"); + +// Block Elements - HTML 4.01 +var blockElements = makeMap("address,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,"+ + "form,hr,ins,isindex,li,map,menu,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul"); + +// Inline Elements - HTML 4.01 +var inlineElements = makeMap("a,abbr,acronym,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,img,"+ + "input,ins,kbd,label,map,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var"); + +// Elements that you can, intentionally, leave open +// (and which close themselves) +var closeSelfElements = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr"); + +// Attributes that have their values filled in disabled="disabled" +var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); + +// Special Elements (can contain anything) +var specialElements = makeMap("script,style"); + +var validElements = extend({}, emptyElements, blockElements, inlineElements, closeSelfElements); +var validAttrs = extend({}, fillAttrs, makeMap( + 'abbr,align,alink,alt,archive,axis,background,bgcolor,border,cellpadding,cellspacing,cite,class,classid,clear,code,codebase,'+ + 'codetype,color,cols,colspan,content,coords,data,dir,face,for,headers,height,href,hreflang,hspace,id,label,lang,language,'+ + 'link,longdesc,marginheight,marginwidth,maxlength,media,method,name,nowrap,profile,prompt,rel,rev,rows,rowspan,rules,scheme,'+ + 'scope,scrolling,shape,size,span,src,standby,start,summary,tabindex,target,text,title,type,usemap,valign,value,valuetype,'+ + 'vlink,vspace,width')); + +/** + * @example + * htmlParser(htmlString, { + * start: function(tag, attrs, unary) {}, + * end: function(tag) {}, + * chars: function(text) {}, + * comment: function(text) {} + * }); + * + * @param {string} html string + * @param {object} handler + */ +var htmlParser = function( html, handler ) { + var index, chars, match, stack = [], last = html; + stack.last = function(){ return stack[ stack.length - 1 ]; }; + + while ( html ) { + chars = true; + + // Make sure we're not in a script or style element + if ( !stack.last() || !specialElements[ stack.last() ] ) { + + // Comment + if ( html.indexOf("<!--") === 0 ) { + index = html.indexOf("-->"); + + if ( index >= 0 ) { + if ( handler.comment ) + handler.comment( html.substring( 4, index ) ); + html = html.substring( index + 3 ); + chars = false; + } + + // end tag + } else if ( BEGING_END_TAGE_REGEXP.test(html) ) { + match = html.match( END_TAG_REGEXP ); + + if ( match ) { + html = html.substring( match[0].length ); + match[0].replace( END_TAG_REGEXP, parseEndTag ); + chars = false; + } + + // start tag + } else if ( BEGIN_TAG_REGEXP.test(html) ) { + match = html.match( START_TAG_REGEXP ); + + if ( match ) { + html = html.substring( match[0].length ); + match[0].replace( START_TAG_REGEXP, parseStartTag ); + chars = false; + } + } + + if ( chars ) { + index = html.indexOf("<"); + + var text = index < 0 ? html : html.substring( 0, index ); + html = index < 0 ? "" : html.substring( index ); + + if ( handler.chars ) + handler.chars( text ); + } + + } else { + html = html.replace(new RegExp("(.*)<\\s*\\/\\s*" + stack.last() + "[^>]*>", 'i'), function(all, text){ + text = text. + replace(COMMENT_REGEXP, "$1"). + replace(CDATA_REGEXP, "$1"); + + if ( handler.chars ) + handler.chars( text ); + + return ""; + }); + + parseEndTag( "", stack.last() ); + } + + if ( html == last ) { + throw "Parse Error: " + html; + } + last = html; + } + + // Clean up any remaining tags + parseEndTag(); + + function parseStartTag( tag, tagName, rest, unary ) { + tagName = lowercase(tagName); + if ( blockElements[ tagName ] ) { + while ( stack.last() && inlineElements[ stack.last() ] ) { + parseEndTag( "", stack.last() ); + } + } + + if ( closeSelfElements[ tagName ] && stack.last() == tagName ) { + parseEndTag( "", tagName ); + } + + unary = emptyElements[ tagName ] || !!unary; + + if ( !unary ) + stack.push( tagName ); + + if ( handler.start ) { + var attrs = {}; + + rest.replace(ATTR_REGEXP, function(match, name) { + var value = arguments[2] ? arguments[2] : + arguments[3] ? arguments[3] : + arguments[4] ? arguments[4] : + fillAttrs[name] ? name : ""; + + attrs[name] = value; //value.replace(/(^|[^\\])"/g, '$1\\\"') //" + }); + + if ( handler.start ) + handler.start( tagName, attrs, unary ); + } + } + + function parseEndTag( tag, tagName ) { + var pos = 0, i; + tagName = lowercase(tagName); + if ( tagName ) + // Find the closest opened tag of the same type + for ( pos = stack.length - 1; pos >= 0; pos-- ) + if ( stack[ pos ] == tagName ) + break; + + if ( pos >= 0 ) { + // Close all the open elements, up the stack + for ( i = stack.length - 1; i >= pos; i-- ) + if ( handler.end ) + handler.end( stack[ i ] ); + + // Remove the open elements from the stack + stack.length = pos; + } + } +}; + +/** + * @param str 'key1,key2,...' + * @returns {key1:true, key2:true, ...} + */ +function makeMap(str){ + var obj = {}, items = str.split(","), i; + for ( i = 0; i < items.length; i++ ) + obj[ items[i] ] = true; + return obj; +} + +/* + * For attack vectors see: http://ha.ckers.org/xss.html + */ +var JAVASCRIPT_URL = /^javascript:/i, + NBSP_REGEXP = / /gim, + HEX_ENTITY_REGEXP = /&#x([\da-f]*);?/igm, + DEC_ENTITY_REGEXP = /&#(\d+);?/igm, + CHAR_REGEXP = /[\w:]/gm, + HEX_DECODE = function(match, code){return fromCharCode(parseInt(code,16));}, + DEC_DECODE = function(match, code){return fromCharCode(code);}; +/** + * @param {string} url + * @returns true if url decodes to something which starts with 'javascript:' hence unsafe + */ +function isJavaScriptUrl(url) { + var chars = []; + url.replace(NBSP_REGEXP, ''). + replace(HEX_ENTITY_REGEXP, HEX_DECODE). + replace(DEC_ENTITY_REGEXP, DEC_DECODE). + // Remove all non \w: characters, unfurtunetly value.replace(/[\w:]/,'') can be defeated using \u0000 + replace(CHAR_REGEXP, function(ch){chars.push(ch);}); + return JAVASCRIPT_URL.test(lowercase(chars.join(''))); +} + +/** + * create an HTML/XML writer which writes to buffer + * @param {Array} buf use buf.jain('') to get out sanitized html string + * @returns { + * start: function(tag, attrs, unary) {}, + * end: function(tag) {}, + * chars: function(text) {}, + * comment: function(text) {} + * } + */ +function htmlSanitizeWriter(buf){ + var ignore = false; + var out = bind(buf, buf.push); + return { + start: function(tag, attrs, unary){ + tag = lowercase(tag); + if (!ignore && specialElements[tag]) { + ignore = tag; + } + if (!ignore && validElements[tag]) { + out('<'); + out(tag); + foreach(attrs, function(value, key){ + if (validAttrs[lowercase(key)] && !isJavaScriptUrl(value)) { + out(' '); + out(key); + out('="'); + out(value. + replace(/</g, '<'). + replace(/>/g, '>'). + replace(/\"/g,'"')); + out('"'); + } + }); + out(unary ? '/>' : '>'); + } + }, + end: function(tag){ + tag = lowercase(tag); + if (!ignore && validElements[tag]) { + out('</'); + out(tag); + out('>'); + } + if (tag == ignore) { + ignore = false; + } + }, + chars: function(chars){ + if (!ignore) { + out(chars. + replace(/&(\w+[&;\W])?/g, function(match, entity){return entity?match:'&';}). + replace(/</g, '<'). + replace(/>/g, '>')); + } + } + }; +} |
