Improve regex to avoid backtrack and to use non-capturing groups (#110)

author: Amaury Levé 2018-07-25 15:09:55 +0200
committer: GitHub 2018-07-25 15:09:55 +0200
commit: 58937179bf180daf93d4cf67d00d3d09fd3c1c3f (patch)
tree: 529c0ae4bcdc7347bb4e71451586fe6a87a875c1
parent: 70768055ca35c7f8e82f6436295cf1e96b25afa7 (diff)
download: sonar-css-58937179bf180daf93d4cf67d00d3d09fd3c1c3f.tar.bz2
2 files changed, 40 insertions, 24 deletions
diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java
index 9a4bb58..7aa1200 100644
--- a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java
+++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java
@@ -27,33 +27,36 @@ import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
 // It is far from being entirely matching the standard definition of css/less/scss tokens nor
 // following the theory of what a lexer responsibilities are but as we are only building line metrics and highlighting
 // on top of it we decided to focus on simplicity over being extensive.
+
+// Be careful to avoid or limit backtracking in these regexes. There is nearly always a non-backtracking alternative (e.g. possessive quantifiers or a lookahead).
+// This improves performance and avoids a lot of StackOverflowError failures in the regex engine.
 public final class CssLexer {
 
-  private static final String NEW_LINE = "(\r\n|\r|\n|\f)";
+  private static final String NEW_LINE = "(?:\r\n|\r|\n|\f)";
   private static final String WHITESPACE = "[\t\n\f\r ]";
   private static final String NON_ASCII = "[^\\p{ASCII}]";
   private static final String HEX_DIGIT = "0-9a-fA-F";
-  private static final String ESCAPE = "(\\\\[" + HEX_DIGIT + "]{1,6}" + WHITESPACE + "?)|\\[^\r\n\f" + HEX_DIGIT + "]";
+  private static final String ESCAPE = "(?:\\\\[" + HEX_DIGIT + "]{1,6}" + WHITESPACE + "?)|\\[^\r\n\f" + HEX_DIGIT + "]";
 
   private static final String PUNCTUATOR = "[!:,;%&+#\\*-/=>\\(\\)\\[\\]\\{\\}]";
 
-  private static final String MULTI_LINE_COMMENT = "/\\*(.|" + NEW_LINE + ")*?\\*/";
-  private static final String INLINE_COMMENT = "//.*";
-  private static final String COMMENT  = "(" + INLINE_COMMENT + "|" + MULTI_LINE_COMMENT + ")";
+  // Use DOTALL mode (https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#DOTALL) so that '.'
+  // also matches line terminators, letting a single lazy ".*?" span a multi-line comment.
+  private static final String MULTI_LINE_COMMENT = "(?s)/\\*.*?\\*/";
+  private static final String INLINE_COMMENT = "//[^\n\r\f]*+";
 
-  private static final String NUMBER = "[+|-]?\\d*\\.?\\d+([a-z]+|%)?";
+  private static final String NUMBER = "[+|-]?+(?:\\d++(?:.\\d++)?+|\\.\\d++)(?:[a-z]++|%)?+";
 
   private static final String NAME_CHAR = "[a-zA-Z0-9_-]|" + NON_ASCII + "|" + ESCAPE;
   private static final String NAME_START = "[a-zA-Z_]|" + NON_ASCII + "|" + ESCAPE;
 
-  private static final String IDENTIFIER = "-?(" + NAME_START + ")(" + NAME_CHAR + ")*";
-  private static final String AT_IDENTIFIER = "@+" + IDENTIFIER;
-  private static final String HASH_IDENTIFIER = "#(" + NAME_CHAR + ")+";
-  private static final String DOLLAR_IDENTIFIER = "\\$(" + NAME_CHAR + ")+";
+  private static final String IDENTIFIER = "-?+(?:" + NAME_START + ")(?:" + NAME_CHAR + ")*+";
+  private static final String AT_IDENTIFIER = "@++" + IDENTIFIER;
+  private static final String HASH_IDENTIFIER = "#(?:" + NAME_CHAR + ")++";
+  private static final String DOLLAR_IDENTIFIER = "\\$(?:" + NAME_CHAR + ")++";
 
-  private static final String DOUBLE_QUOTE_STRING = "~?\"([^\"\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*\"";
-  private static final String SINGLE_QUOTE_STRING = "~?'([^'\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*'";
-  private static final String STRING = "(" + SINGLE_QUOTE_STRING + "|" + DOUBLE_QUOTE_STRING + ")";
+  private static final String DOUBLE_QUOTE_STRING = "~?+\"(?:[^\"\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*+\"";
+  private static final String SINGLE_QUOTE_STRING = "~?+'(?:[^'\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*+'";
 
   private CssLexer() {
   }
@@ -62,8 +65,10 @@ public final class CssLexer {
     return Lexer.builder()
       .withFailIfNoChannelToConsumeOneCharacter(false)
 
-      .withChannel(regexp(CssTokenType.COMMENT, COMMENT))
-      .withChannel(regexp(CssTokenType.STRING, STRING))
+      .withChannel(regexp(CssTokenType.COMMENT, MULTI_LINE_COMMENT))
+      .withChannel(regexp(CssTokenType.COMMENT, INLINE_COMMENT))
+      .withChannel(regexp(CssTokenType.STRING, DOUBLE_QUOTE_STRING))
+      .withChannel(regexp(CssTokenType.STRING, SINGLE_QUOTE_STRING))
       .withChannel(regexp(CssTokenType.AT_IDENTIFIER, AT_IDENTIFIER))
       .withChannel(regexp(CssTokenType.HASH_IDENTIFIER, HASH_IDENTIFIER))
       .withChannel(regexp(CssTokenType.DOLLAR_IDENTIFIER, DOLLAR_IDENTIFIER))
diff --git a/sonar-css-plugin/src/test/java/org/sonar/css/plugin/TokenizerTest.java b/sonar-css-plugin/src/test/java/org/sonar/css/plugin/TokenizerTest.java
index a0cf508..a9b5e36 100644
--- a/sonar-css-plugin/src/test/java/org/sonar/css/plugin/TokenizerTest.java
+++ b/sonar-css-plugin/src/test/java/org/sonar/css/plugin/TokenizerTest.java
@@ -20,10 +20,9 @@
 package org.sonar.css.plugin;
 
 import java.util.List;
+import org.apache.commons.lang.StringUtils;
 import org.junit.Test;
 
-import java.util.List;
-
 import static org.assertj.core.api.Assertions.assertThat;
 
 public class TokenizerTest {
@@ -78,6 +77,7 @@ public class TokenizerTest {
   public void number() {
     assertToken("1.15", 0, "1.15", CssTokenType.NUMBER);
     assertToken("1", 0, "1", CssTokenType.NUMBER);
+    assertToken(".1", 0, ".1", CssTokenType.NUMBER);
     assertToken("1.15px", 0, "1.15px", CssTokenType.NUMBER);
     assertToken("1.15%", 0, "1.15%", CssTokenType.NUMBER);
     assertToken("1px", 0, "1px", CssTokenType.NUMBER);
@@ -103,22 +103,33 @@ public class TokenizerTest {
     assertToken("bar { foo: \"\"; }", 4, "\"\"", CssTokenType.STRING);
     assertToken("\"foo\\\nbar\"", 0, "\"foo\\\nbar\"", CssTokenType.STRING);
     assertToken("@min768: ~\"(min-width: 768px)\"", 2, "~\"(min-width: 768px)\"", CssTokenType.STRING);
+
+    int numberOfChars = 1000000;
+    String seedCode = StringUtils.repeat("a", numberOfChars);
+
+    String testCode = "\"" + seedCode + "\"";
+    assertToken(testCode, 0, testCode, CssTokenType.STRING, 1, 0, 1, testCode.length());
+    testCode = "'" + seedCode + "'";
+    assertToken(testCode, 0, testCode, CssTokenType.STRING, 1, 0, 1, testCode.length());
   }
 
   @Test
   public void comment() {
     assertToken("/* foo */", 0, "/* foo */", CssTokenType.COMMENT);
     assertToken("foo { a: /* foo */ 42; }", 4, "/* foo */", CssTokenType.COMMENT);
-    assertToken("/* \n"
-      + "  this is a comment\n"
-      + "  and it is awesome because\n"
-      + "  it is multiline!\n"
-      + "*/", 0, "/* \n"
+    assertToken("foo { a: /* foo\nbar*/ 42; }", 4, "/* foo\nbar*/", CssTokenType.COMMENT, 1, 9, 2, 5);
+    assertToken("foo { a: /* foo\r\nbar*/ 42; }", 4, "/* foo\r\nbar*/", CssTokenType.COMMENT, 1, 9, 2, 5);
+    assertToken("foo { a: /* foo\fbar*/ 42; }", 4, "/* foo\fbar*/", CssTokenType.COMMENT, 1, 9, 1, 21);
+    String code = "/* \n"
       + "  this is a comment\n"
       + "  and it is awesome because\n"
       + "  it is multiline!\n"
-      + "*/", CssTokenType.COMMENT, 1, 0, 5, 2);
-    assertToken("foo { a: /* foo\nbar*/ 42; }", 4, "/* foo\nbar*/", CssTokenType.COMMENT, 1, 9, 2, 5);
+      + "*/";
+    assertToken(code, 0, code, CssTokenType.COMMENT, 1, 0, 5, 2);
+
+    int numberOfLineReturn = 1000000;
+    code = "/*" + StringUtils.repeat(" *\n", numberOfLineReturn) + " */";
+    assertToken(code, 0, code, CssTokenType.COMMENT, 1, 0, numberOfLineReturn + 1, 3);
   }
 
   @Test
author	Amaury Levé	2018-07-25 15:09:55 +0200
committer	GitHub	2018-07-25 15:09:55 +0200
commit	58937179bf180daf93d4cf67d00d3d09fd3c1c3f (patch)
tree	529c0ae4bcdc7347bb4e71451586fe6a87a875c1
parent	70768055ca35c7f8e82f6436295cf1e96b25afa7 (diff)
download	sonar-css-58937179bf180daf93d4cf67d00d3d09fd3c1c3f.tar.bz2