diff options
| -rw-r--r-- | src/sanitizer.js | 115 | ||||
| -rw-r--r-- | test/sanitizerSpec.js | 49 | 
2 files changed, 90 insertions, 74 deletions
| diff --git a/src/sanitizer.js b/src/sanitizer.js index 94c36706..66631a90 100644 --- a/src/sanitizer.js +++ b/src/sanitizer.js @@ -21,7 +21,9 @@ var START_TAG_REGEXP = /^<\s*([\w:]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']    BEGIN_TAG_REGEXP = /^</,    BEGING_END_TAGE_REGEXP = /^<\s*\//,    COMMENT_REGEXP = /<!--(.*?)-->/g, -  CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g; +  CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g, +  URI_REGEXP = /^((ftp|https?):\/\/|mailto:|#)/, +  NON_ALPHANUMERIC_REGEXP = /([^\#-~| |!])/g; // Match everything outside of normal chars and " (quote character)  // Empty Elements - HTML 4.01  var emptyElements = makeMap("area,base,basefont,br,col,hr,img,input,isindex,link,param"); @@ -33,24 +35,24 @@ var blockElements = makeMap("address,blockquote,button,center,dd,del,dir,div,dl,  // Inline Elements - HTML 4.01  var inlineElements = makeMap("a,abbr,acronym,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,img,"+      "input,ins,kbd,label,map,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var"); -  // Elements that you can, intentionally, leave open  // (and which close themselves)  var closeSelfElements = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr"); - -// Attributes that have their values filled in disabled="disabled" -var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); -  // Special Elements (can contain anything)  var specialElements = makeMap("script,style"); -  var validElements = extend({}, emptyElements, blockElements, inlineElements, closeSelfElements); -var validAttrs = extend({}, fillAttrs, makeMap( -    'abbr,align,alink,alt,archive,axis,background,bgcolor,border,cellpadding,cellspacing,cite,class,classid,clear,code,codebase,'+ -    'codetype,color,cols,colspan,content,coords,data,dir,face,for,headers,height,href,hreflang,hspace,id,label,lang,language,'+ -    'link,longdesc,marginheight,marginwidth,maxlength,media,method,name,nowrap,profile,prompt,rel,rev,rows,rowspan,rules,scheme,'+ -    'scope,scrolling,shape,size,span,src,standby,start,summary,tabindex,target,text,title,type,usemap,valign,value,valuetype,'+ -    'vlink,vspace,width')); + +//see: http://www.w3.org/TR/html4/index/attributes.html +//Attributes that have their values filled in disabled="disabled" +var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); +//Attributes that have href and hence need to be sanitized +var uriAttrs = makeMap("background,href,longdesc,src,usemap"); +var validAttrs = extend({}, fillAttrs, uriAttrs, makeMap( +    'abbr,align,alt,axis,bgcolor,border,cellpadding,cellspacing,class,clear,'+ +    'color,cols,colspan,coords,dir,face,for,headers,height,hreflang,hspace,id,'+ +    'label,lang,language,maxlength,method,name,prompt,rel,rev,rows,rowspan,rules,'+ +    'scope,scrolling,shape,size,span,start,summary,tabindex,target,title,type,'+ +    'valign,value,vspace,width'));  /**   * @example @@ -64,7 +66,7 @@ var validAttrs = extend({}, fillAttrs, makeMap(   * @param {string} html string   * @param {object} handler   */ -var htmlParser = function( html, handler ) { +function htmlParser( html, handler ) {    var index, chars, match, stack = [], last = html;    stack.last = function(){ return stack[ stack.length - 1 ]; }; @@ -112,8 +114,7 @@ var htmlParser = function( html, handler ) {          var text = index < 0 ? html : html.substring( 0, index );          html = index < 0 ? "" : html.substring( index ); -        if ( handler.chars ) -          handler.chars( text ); +        handler.chars( decodeEntities(text) );        }      } else { @@ -122,8 +123,7 @@ var htmlParser = function( html, handler ) {            replace(COMMENT_REGEXP, "$1").            replace(CDATA_REGEXP, "$1"); -        if ( handler.chars ) -          handler.chars( text ); +        handler.chars( decodeEntities(text) );          return "";        }); @@ -157,21 +157,18 @@ var htmlParser = function( html, handler ) {      if ( !unary )        stack.push( tagName ); -    if ( handler.start ) { -      var attrs = {}; +    var attrs = {}; -      rest.replace(ATTR_REGEXP, function(match, name) { -        var value = arguments[2] ? arguments[2] : -          arguments[3] ? arguments[3] : -          arguments[4] ? arguments[4] : -          fillAttrs[name] ? name : ""; +    rest.replace(ATTR_REGEXP, function(match, name) { +      var value = arguments[2] ? arguments[2] : +        arguments[3] ? arguments[3] : +        arguments[4] ? arguments[4] : +        fillAttrs[name] ? name : ""; -        attrs[name] = value; //value.replace(/(^|[^\\])"/g, '$1\\\"') //" -      }); +      attrs[name] = decodeEntities(value); //value.replace(/(^|[^\\])"/g, '$1\\\"') //" +    }); -      if ( handler.start ) -        handler.start( tagName, attrs, unary ); -    } +    handler.start( tagName, attrs, unary );    }    function parseEndTag( tag, tagName ) { @@ -186,8 +183,7 @@ var htmlParser = function( html, handler ) {      if ( pos >= 0 ) {        // Close all the open elements, up the stack        for ( i = stack.length - 1; i >= pos; i-- ) -        if ( handler.end ) -          handler.end( stack[ i ] ); +        handler.end( stack[ i ] );        // Remove the open elements from the stack        stack.length = pos; @@ -206,28 +202,32 @@ function makeMap(str){    return obj;  } -/* - * For attack vectors see: http://ha.ckers.org/xss.html +/** + * decodes all entities into regular string + * @param value + * @returns   */ -var JAVASCRIPT_URL = /^javascript:/i, -    NBSP_REGEXP = / /gim, -    HEX_ENTITY_REGEXP = /&#x([\da-f]*);?/igm, -    DEC_ENTITY_REGEXP = /&#(\d+);?/igm, -    CHAR_REGEXP = /[\w:]/gm, -    HEX_DECODE = function(match, code){return fromCharCode(parseInt(code,16));}, -    DEC_DECODE = function(match, code){return fromCharCode(code);}; +var hiddenPre=document.createElement("pre"); +function decodeEntities(value) { +  hiddenPre.innerHTML=value.replace(/</g,"<"); +  return hiddenPre.innerText || hiddenPre.textContent; +} +  /** - * @param {string} url - * @returns true if url decodes to something which starts with 'javascript:' hence unsafe + * Escapes all potentially dangerous characters, so that the  + * resulting string can be safely inserted into attribute or + * element text. + * @param value + * @returns escaped text   */ -function isJavaScriptUrl(url) { -  var chars = []; -  url.replace(NBSP_REGEXP, ''). -      replace(HEX_ENTITY_REGEXP, HEX_DECODE). -      replace(DEC_ENTITY_REGEXP, DEC_DECODE). -      // Remove all non \w: characters, unfurtunetly value.replace(/[\w:]/,'') can be defeated using \u0000 -      replace(CHAR_REGEXP, function(ch){chars.push(ch);}); -  return JAVASCRIPT_URL.test(lowercase(chars.join(''))); +function encodeEntities(value) { +  return value. +    replace(/&/g, '&'). +    replace(NON_ALPHANUMERIC_REGEXP, function(value){ +      return '&#' + value.charCodeAt(0) + ';'; +    }). +    replace(/</g, '<'). +    replace(/>/g, '>');  }  /** @@ -253,14 +253,12 @@ function htmlSanitizeWriter(buf){          out('<');          out(tag);          foreach(attrs, function(value, key){ -          if (validAttrs[lowercase(key)] && !isJavaScriptUrl(value)) { +          var lkey=lowercase(key); +          if (validAttrs[lkey] && (uriAttrs[lkey]!==true || value.match(URI_REGEXP))) {              out(' ');              out(key);              out('="'); -            out(value. -                replace(/</g, '<'). -                replace(/>/g, '>'). -                replace(/\"/g,'"')); +            out(encodeEntities(value));              out('"');            }          }); @@ -280,10 +278,7 @@ function htmlSanitizeWriter(buf){        },      chars: function(chars){          if (!ignore) { -          out(chars. -              replace(/&(\w+[&;\W])?/g, function(match, entity){return entity?match:'&';}). -              replace(/</g, '<'). -              replace(/>/g, '>')); +          out(encodeEntities(chars));          }        }    }; diff --git a/test/sanitizerSpec.js b/test/sanitizerSpec.js index 4e1ff355..88da693d 100644 --- a/test/sanitizerSpec.js +++ b/test/sanitizerSpec.js @@ -6,7 +6,7 @@ describe('HTML', function(){    it('should echo html', function(){      expectHTML('hello<b class="1\'23" align=\'""\'>world</b>.'). -       toEqual('hello<b class="1\'23" align="""">world</b>.'); +       toEqual('hello<b class="1\'23" align="""">world</b>.');    });    it('should remove script', function(){ @@ -49,9 +49,15 @@ describe('HTML', function(){      expectHTML('a<my:hr/><my:div>b</my:div>c').toEqual('abc');    }); +  it('should handle entities', function(){ +    var everything = '<div id="!@#$%^&*()_+-={}[]:";\'<>?,./`~ ħ">' +  +    '!@#$%^&*()_+-={}[]:";\'<>?,./`~ ħ</div>'; +    expectHTML(everything).toEqual(everything); +  }); +      it('should handle improper html', function(){ -    expectHTML('< div id="</div>" alt=abc href=\'"\' >text< /div>'). -      toEqual('<div id="</div>" alt="abc" href=""">text</div>'); +    expectHTML('< div id="</div>" alt=abc dir=\'"\' >text< /div>'). +      toEqual('<div id="</div>" alt="abc" dir=""">text</div>');    });    it('should handle improper html2', function(){ @@ -81,19 +87,14 @@ describe('HTML', function(){        expect(html).toEqual('a<div>&</div>c');      }); -    it('should not double escape entities', function(){ -      writer.chars(' ><'); -      expect(html).toEqual(' ><'); -    }); -      it('should escape IE script', function(){ -      writer.chars('&{}'); -      expect(html).toEqual('&{}'); +      writer.chars('&<>{}'); +      expect(html).toEqual('&<>{}');      });      it('should escape attributes', function(){ -      writer.start('div', {id:'\"\'<>'}); -      expect(html).toEqual('<div id=""\'<>">'); +      writer.start('div', {id:'!@#$%^&*()_+-={}[]:";\'<>?,./`~ \n\0\r\u0127'}); +      expect(html).toEqual('<div id="!@#$%^&*()_+-={}[]:";\'<>?,./`~ 
�
ħ">');      });      it('should ignore missformed elements', function(){ @@ -105,12 +106,32 @@ describe('HTML', function(){        writer.start('div', {unknown:""});        expect(html).toEqual('<div>');      }); +     +    describe('isUri', function(){ +       +      function isUri(value) { +        return value.match(URI_REGEXP); +      } +       +      it('should be URI', function(){ +        expect(isUri('http://abc')).toBeTruthy(); +        expect(isUri('https://abc')).toBeTruthy(); +        expect(isUri('ftp://abc')).toBeTruthy(); +        expect(isUri('mailto:me@example.com')).toBeTruthy(); +        expect(isUri('#anchor')).toBeTruthy(); +      }); +       +      it('should not be UIR', function(){ +        expect(isUri('')).toBeFalsy(); +        expect(isUri('javascript:alert')).toBeFalsy(); +      }); +    });      describe('javascript URL attribute', function(){        beforeEach(function(){          this.addMatchers({            toBeValidUrl: function(){ -            return !isJavaScriptUrl(this.actual); +            return URI_REGEXP.exec(this.actual);            }          });        }); @@ -118,7 +139,7 @@ describe('HTML', function(){        it('should ignore javascript:', function(){          expect('JavaScript:abc').not.toBeValidUrl();          expect(' \n Java\n Script:abc').not.toBeValidUrl(); -        expect('JavaScript/my.js').toBeValidUrl(); +        expect('http://JavaScript/my.js').toBeValidUrl();        });        it('should ignore dec encoded javascript:', function(){ | 
