Refactor lexer to use regular expressions

author: Misko Hevery 2010-12-07 11:42:34 -0800
committer: Misko Hevery 2010-12-08 14:39:22 -0800
commit: 23fc73081feb640164615930b36ef185c23a3526 (patch)
tree: 3354648159e348e97ba4b49c005d7c9e64a343bd
parent: e5e69d9b90850eb653883f52c76e28dd870ee067 (diff)
download: angular.js-23fc73081feb640164615930b36ef185c23a3526.tar.bz2
2 files changed, 56 insertions, 103 deletions
diff --git a/src/parser.js b/src/parser.js
index 01edb3f1..fec23899 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -32,7 +32,7 @@ function lex(text, parseStringsForObjects){
       index = 0,
       json = [],
       ch,
-      lastCh = ':'; // can start regexp
+      lastCh = ':';
 
   while (index < text.length) {
     ch = text.charAt(index);
@@ -71,6 +71,9 @@ function lex(text, parseStringsForObjects){
     lastCh = ch;
   }
   return tokens;
+  
+  
+  //////////////////////////////////////////////
 
   function is(chars) {
     return chars.indexOf(ch) != -1;
@@ -95,10 +98,6 @@ function lex(text, parseStringsForObjects){
            'A' <= ch && ch <= 'Z' ||
            '_' == ch || ch == '$';
   }
-  function isExpOperator(ch) {
-    return ch == '-' || ch == '+' || isNumber(ch);
-  }
-
   function throwError(error, start, end) {
     end = end || index;
     throw Error("Lexer Error: " + error + " at column" +
@@ -107,103 +106,61 @@ function lex(text, parseStringsForObjects){
             " " + end) + 
         " in expression [" + text + "].");
   }
+  
+  function consume(regexp, processToken, errorMsg) {
+    var match = text.substr(index).match(regexp);
+    var token = {index: index};
+    var start = index;
+    if (!match) throwError(errorMsg);
+    index += match[0].length;
+    processToken(token, token.text = match[0], start);
+    tokens.push(token);
+  }
 
   function readNumber() {
-    var number = "";
-    var start = index;
-    while (index < text.length) {
-      var ch = lowercase(text.charAt(index));
-      if (ch == '.' || isNumber(ch)) {
-        number += ch;
-      } else {
-        var peekCh = peek();
-        if (ch == 'e' && isExpOperator(peekCh)) {
-          number += ch;
-        } else if (isExpOperator(ch) &&
-            peekCh && isNumber(peekCh) &&
-            number.charAt(number.length - 1) == 'e') {
-          number += ch;
-        } else if (isExpOperator(ch) &&
-            (!peekCh || !isNumber(peekCh)) &&
-            number.charAt(number.length - 1) == 'e') {
-          throwError('Invalid exponent');
-        } else {
-          break;
-        }
-      }
-      index++;
-    }
-    number = 1 * number;
-    tokens.push({index:start, text:number, json:true,
-      fn:function(){return number;}});
+    consume(/^(\d+)?(\.\d+)?([eE][+-]?\d+)?/, function(token, number){
+      token.text = number = 1 * number;
+      token.json = true;
+      token.fn = valueFn(number);
+    }, "Not a valid number");
   }
+  
   function readIdent() {
-    var ident = "";
-    var start = index;
-    var fn;
-    while (index < text.length) {
-      var ch = text.charAt(index);
-      if (ch == '.' || isIdent(ch) || isNumber(ch)) {
-        ident += ch;
-      } else {
-        break;
+    consume(/^[\w_\$][\w_\$\d]*(\.[\w_\$][\w_\$\d]*)*/, function(token, ident){
+      fn = OPERATORS[ident];
+      if (!fn) {
+        fn = getterFn(ident);
+        fn.isAssignable = ident;
       }
-      index++;
-    }
-    fn = OPERATORS[ident];
-    tokens.push({
-      index:start, 
-      text:ident, 
-      json: fn,
-      fn:fn||extend(getterFn(ident), {
+      token.fn = OPERATORS[ident]||extend(getterFn(ident), {
         assign:function(self, value){
           return setter(self, ident, value);
         }
-      })
+      });
+      token.json = OPERATORS[ident];
     });
   }
   
   function readString(quote) {
-    var start = index;
-    index++;
-    var string = "";
-    var rawString = quote;
-    var escape = false;
-    while (index < text.length) {
-      var ch = text.charAt(index);
-      rawString += ch;
-      if (escape) {
-        if (ch == 'u') {
-          var hex = text.substring(index + 1, index + 5);
-          if (!hex.match(/[\da-f]{4}/i))
-            throwError( "Invalid unicode escape [\\u" + hex + "]");
-          index += 4;
-          string += String.fromCharCode(parseInt(hex, 16));
-        } else {
-          var rep = ESCAPE[ch];
-          if (rep) {
-            string += rep;
-          } else {
-            string += ch;
-          }
-        }
-        escape = false;
-      } else if (ch == '\\') {
-        escape = true;
-      } else if (ch == quote) {
-        index++;
-        tokens.push({index:start, text:rawString, string:string, json:true,
-          fn:function(){
-            return (string.length == dateParseLength) ?
-              angular['String']['toDate'](string) : string;
-          }});
-        return;
-      } else {
-        string += ch;
-      }
-      index++;
-    }
-    throwError("Unterminated quote", start);
+    consume(/^(('(\\'|[^'])*')|("(\\"|[^"])*"))/, function(token, rawString, start){
+      var hasError;
+      var string = token.string = rawString.substr(1, rawString.length - 2).
+        replace(/(\\u(.?.?.?.?))|(\\(.))/g, 
+          function(match, wholeUnicode, unicode, wholeEscape, escape){
+            if (unicode && !unicode.match(/[\da-fA-F]{4}/))
+              hasError = hasError || bind(null, throwError, "Invalid unicode escape [\\u" + unicode + "]", start);
+            return unicode ? 
+                String.fromCharCode(parseInt(unicode, 16)) : 
+                ESCAPE[escape] || escape;
+          });
+      (hasError||noop)();
+      token.json = true;
+      token.fn = function(){
+        return (string.length == dateParseLength) ?
+            angular['String']['toDate'](string) : 
+            string;
+      };
+    }, "Unterminated string");
   }
 }
 
diff --git a/test/ParserSpec.js b/test/ParserSpec.js
index c237aa40..71208783 100644
--- a/test/ParserSpec.js
+++ b/test/ParserSpec.js
@@ -82,9 +82,15 @@ describe('parser', function() {
       expect(tokens.length).toEqual(1);
       expect(tokens[0].string).toEqual('\u00a0');
     });
+    
+    it('should error when non terminated string', function(){
+      expect(function(){
+        lex('ignore "text');
+      }).toThrow(new Error('Lexer Error: Unterminated string at column 7 in expression [ignore "text].'));
+    });
 
     it('should ignore whitespace', function() {
-      var tokens = lex("a \t \n \r b");
+      var tokens = lex("a \t \n \r \u00A0 b");
       expect(tokens[0].text).toEqual('a');
       expect(tokens[1].text).toEqual('b');
     });
@@ -130,16 +136,6 @@ describe('parser', function() {
       expect(tokens[0].text).toEqual(0.5E+10);
     });
 
-    it('should throws exception for invalid exponent', function() {
-      expect(function() {
-        lex("0.5E-");
-      }).toThrow(new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-].'));
-      
-      expect(function() {
-        lex("0.5E-A");
-      }).toThrow(new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-A].'));
-    });
-
     it('should tokenize number starting with a dot', function() {
       var tokens = lex(".5");
       expect(tokens[0].text).toEqual(0.5);
@@ -147,8 +143,8 @@ describe('parser', function() {
 
     it('should throw error on invalid unicode', function() {
       expect(function() {
-        lex("'\\u1''bla'");
-      }).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1''b] at column 2 in expression ['\\u1''bla']."));
+        lex("'\\u1xbla'");
+      }).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1xbl] at columns 0-9 ['\\u1xbla'] in expression ['\\u1xbla']."));
     });
   });
author	Misko Hevery	2010-12-07 11:42:34 -0800
committer	Misko Hevery	2010-12-08 14:39:22 -0800
commit	23fc73081feb640164615930b36ef185c23a3526 (patch)
tree	3354648159e348e97ba4b49c005d7c9e64a343bd
parent	e5e69d9b90850eb653883f52c76e28dd870ee067 (diff)
download	angular.js-23fc73081feb640164615930b36ef185c23a3526.tar.bz2