diff options
Diffstat (limited to 'lib/htmlparser/htmlparser.js')
| -rw-r--r-- | lib/htmlparser/htmlparser.js | 309 | 
1 files changed, 309 insertions, 0 deletions
/*
 * HTML Parser By John Resig (ejohn.org)
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * // Use like so:
 * htmlParser(htmlString, {
 *     start: function(tag, attrs, unary) {},
 *     end: function(tag) {},
 *     chars: function(text) {},
 *     comment: function(text) {}
 * });
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * HTMLtoDOM(htmlString);
 *
 * // or to inject into the body of an existing document
 * // (a node may be passed instead; its owner document is used)
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */

(function(){

  // Object the public API (htmlParser, HTMLtoXML, HTMLtoDOM) is attached to.
  // In a classic (sloppy-mode) script `this` is the global object. In strict
  // mode or an ES module `this` is undefined, so fall back to whichever
  // global object is available instead of throwing on the first assignment.
  var root = this || (
    typeof globalThis !== "undefined" ? globalThis :
    typeof window !== "undefined" ? window :
    typeof self !== "undefined" ? self :
    typeof global !== "undefined" ? global : {}
  );

  // Regular Expressions for parsing tags and attributes
  var startTag = /^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
    endTag = /^<\/(\w+)[^>]*>/,
    attr = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

  // Empty Elements - HTML 4.01
  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

  // Block Elements - HTML 4.01
  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

  // Inline Elements - HTML 4.01
  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

  // Elements that you can, intentionally, leave open
  // (and which close themselves)
  var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

  // Attributes that have their values filled in disabled="disabled"
  var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

  // Special Elements (can contain anything)
  var special = makeMap("script,style");

  /**
   * Tokenizes an HTML string and reports what it finds through callbacks.
   *
   * @param {string} html - Markup to parse.
   * @param {Object} handler - Callbacks; each one is optional:
   *   start(tagName, attrs, unary) - attrs is an array of
   *     {name, value, escaped} objects, unary is a boolean;
   *   end(tagName); chars(text); comment(text).
   * @throws {string} "Parse Error: " followed by the remaining input when an
   *   iteration consumes nothing. A string (not an Error) is thrown on
   *   purpose so existing callers that catch it keep working.
   */
  var htmlParser = root.htmlParser = function( html, handler ) {
    var index, chars, match, text, stack = [], last = html;

    // Name of the innermost open element (undefined when nothing is open)
    stack.last = function(){
      return this[ this.length - 1 ];
    };

    while ( html ) {
      chars = true;

      // Make sure we're not in a script or style element
      if ( !stack.last() || !special[ stack.last() ] ) {

        // Comment
        if ( html.indexOf("<!--") === 0 ) {
          index = html.indexOf("-->");

          if ( index >= 0 ) {
            if ( handler.comment )
              handler.comment( html.substring( 4, index ) );
            html = html.substring( index + 3 );
            chars = false;
          }

        // end tag
        } else if ( html.indexOf("</") === 0 ) {
          match = html.match( endTag );

          if ( match ) {
            html = html.substring( match[0].length );
            match[0].replace( endTag, parseEndTag );
            chars = false;
          }

        // start tag
        } else if ( html.indexOf("<") === 0 ) {
          match = html.match( startTag );

          if ( match ) {
            html = html.substring( match[0].length );
            match[0].replace( startTag, parseStartTag );
            chars = false;
          }
        }

        // Nothing tag-like was consumed: everything up to the next "<"
        // is plain text
        if ( chars ) {
          index = html.indexOf("<");

          text = index < 0 ? html : html.substring( 0, index );
          html = index < 0 ? "" : html.substring( index );

          if ( handler.chars )
            handler.chars( text );
        }

      } else {
        // Inside <script>/<style>: take everything up to the FIRST matching
        // close tag as raw text. [\s\S] is used instead of "." so that
        // content spanning several lines is captured, and the match is
        // non-greedy so a later element of the same kind is not swallowed.
        html = html.replace(
          new RegExp( "^([\\s\\S]*?)</" + stack.last() + "[^>]*>", "i" ),
          function( all, raw ) {
            // Unwrap comment / CDATA guards around the raw content
            raw = raw.replace( /<!--([\s\S]*?)-->/g, "$1" )
              .replace( /<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1" );

            if ( handler.chars )
              handler.chars( raw );

            return "";
          }
        );

        parseEndTag( "", stack.last() );
      }

      // No progress was made in this iteration: bail out rather than
      // loop forever
      if ( html == last )
        throw "Parse Error: " + html;
      last = html;
    }

    // Clean up any remaining tags
    parseEndTag();

    // Handles one start tag. The signature matches a String#replace callback
    // for `startTag`: (whole match, tag name, raw attribute text, "/" or "").
    function parseStartTag( tag, tagName, rest, unary ) {
      // A block element implicitly closes any inline elements still open
      if ( block[ tagName ] ) {
        while ( stack.last() && inline[ stack.last() ] ) {
          parseEndTag( "", stack.last() );
        }
      }

      // <li>a<li>b : the second <li> closes the first
      if ( closeSelf[ tagName ] && stack.last() === tagName ) {
        parseEndTag( "", tagName );
      }

      unary = empty[ tagName ] || !!unary;

      if ( !unary )
        stack.push( tagName );

      if ( handler.start ) {
        var attrs = [];

        rest.replace( attr, function( match, name, doubleQuoted, singleQuoted, bare ) {
          // First non-empty capture wins; boolean attributes without a
          // value get their own name as value (disabled="disabled")
          var value = doubleQuoted ? doubleQuoted :
            singleQuoted ? singleQuoted :
            bare ? bare :
            fillAttrs[ name ] ? name : "";

          attrs.push({
            name: name,
            value: value,
            escaped: value.replace(/(^|[^\\])"/g, '$1\\\"')
          });
        });

        handler.start( tagName, attrs, unary );
      }
    }

    // Handles one end tag. Called directly, or as a String#replace callback
    // for `endTag`: (whole match, tag name). Without a tag name every open
    // element is closed.
    function parseEndTag( tag, tagName ) {
      var pos, i;

      // If no tag name is provided, clean shop
      if ( !tagName ) {
        pos = 0;

      // Find the closest opened tag of the same type
      } else {
        for ( pos = stack.length - 1; pos >= 0; pos-- ) {
          if ( stack[ pos ] === tagName )
            break;
        }
      }

      // pos is -1 when the end tag matches nothing that is open; such a
      // tag is ignored
      if ( pos >= 0 ) {
        // Close all the open elements, up the stack
        for ( i = stack.length - 1; i >= pos; i-- ) {
          if ( handler.end )
            handler.end( stack[ i ] );
        }

        // Remove the open elements from the stack
        stack.length = pos;
      }
    }
  };

  /**
   * Serializes HTML into a string in which every element is explicitly
   * closed and every attribute carries a double-quoted value.
   *
   * @param {string} html - Markup to convert.
   * @returns {string} The re-serialized markup.
   */
  root.HTMLtoXML = function( html ) {
    var results = "";

    htmlParser(html, {
      start: function( tag, attrs, unary ) {
        results += "<" + tag;

        for ( var i = 0; i < attrs.length; i++ )
          results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

        results += (unary ? "/" : "") + ">";
      },
      end: function( tag ) {
        results += "</" + tag + ">";
      },
      chars: function( text ) {
        results += text;
      },
      comment: function( text ) {
        results += "<!--" + text + "-->";
      }
    });

    return results;
  };

  /**
   * Parses HTML into DOM nodes appended to a document's <body>.
   *
   * @param {string} html - Markup to convert.
   * @param {Object} [doc] - A document, or a node whose owner document is
   *   used. When omitted a new document is created with whichever factory
   *   the environment offers.
   * @returns {Object} The document that received the nodes.
   * @throws {TypeError} When no document is supplied and none can be created.
   */
  root.HTMLtoDOM = function( html, doc ) {
    // There can be only one of these elements
    var one = makeMap("html,head,body,title");

    // Enforce a structure for the document
    var structure = {
      link: "head",
      base: "head"
    };

    if ( !doc ) {
      if ( typeof DOMDocument !== "undefined" )
        doc = new DOMDocument();
      else if ( typeof document !== "undefined" && document.implementation && document.implementation.createDocument )
        doc = document.implementation.createDocument("", "", null);
      else if ( typeof ActiveXObject !== "undefined" )
        doc = new ActiveXObject("Msxml.DOMDocument");

      if ( !doc )
        throw new TypeError("HTMLtoDOM: no document supplied and no DOM implementation available");

    } else
      doc = doc.ownerDocument ||
        doc.getOwnerDocument && doc.getOwnerDocument() ||
        doc;

    // elems mirrors the parser's stack of open elements
    var elems = [],
      documentElement = doc.documentElement ||
        doc.getDocumentElement && doc.getDocumentElement();

    // If we're dealing with an empty document then we
    // need to pre-populate it with the HTML document structure
    if ( !documentElement && doc.createElement ) (function(){
      var html = doc.createElement("html");
      var head = doc.createElement("head");
      head.appendChild( doc.createElement("title") );
      html.appendChild( head );
      html.appendChild( doc.createElement("body") );
      doc.appendChild( html );
    })();

    // Find all the unique elements. An entry becomes undefined when the
    // document has no such element, and stays `true` when the document
    // cannot be queried at all.
    if ( doc.getElementsByTagName )
      for ( var i in one )
        one[ i ] = doc.getElementsByTagName( i )[0];

    // If we're working with a document, inject contents into
    // the body element
    var rootParent = one.body,
      curParentNode = rootParent;

    htmlParser( html, {
      start: function( tagName, attrs, unary ) {
        // If it's a pre-built element, then we can ignore
        // its construction. It is still tracked as open so that its
        // end tag pops the right entry.
        if ( one[ tagName ] ) {
          curParentNode = one[ tagName ];
          if ( !unary )
            elems.push( curParentNode );
          return;
        }

        var elem = doc.createElement( tagName );

        // attrs is an array: iterate by index so inherited enumerable
        // properties are never mistaken for attributes
        for ( var j = 0; j < attrs.length; j++ )
          elem.setAttribute( attrs[ j ].name, attrs[ j ].value );

        // Elements such as <link> belong in a fixed place, provided the
        // document actually has it
        var forcedParent = structure[ tagName ] && one[ structure[ tagName ] ];

        if ( forcedParent && forcedParent.appendChild )
          forcedParent.appendChild( elem );

        else if ( curParentNode && curParentNode.appendChild )
          curParentNode.appendChild( elem );

        if ( !unary ) {
          elems.push( elem );
          curParentNode = elem;
        }
      },
      end: function( tag ) {
        if ( elems.length )
          elems.length -= 1;

        // Init the new parentNode; once everything is closed, content
        // goes back into the body
        curParentNode = elems.length ? elems[ elems.length - 1 ] : rootParent;
      },
      chars: function( text ) {
        if ( curParentNode && curParentNode.appendChild )
          curParentNode.appendChild( doc.createTextNode( text ) );
      },
      comment: function( text ) {
        // Comments are not added to the DOM
      }
    });

    return doc;
  };

  // Builds a lookup table { name: true } from a comma-separated list.
  // The table has no prototype where the engine allows it, so names such
  // as "constructor" or "toString" are not reported as members.
  function makeMap(str){
    var obj = typeof Object.create === "function" ? Object.create( null ) : {},
      items = str.split(",");
    for ( var i = 0; i < items.length; i++ )
      obj[ items[i] ] = true;
    return obj;
  }
})();
\ No newline at end of file | 
