From df153ba45fffa47f1bff7a4201d5fd16fc7b3445 Mon Sep 17 00:00:00 2001
From: Amaury Levé
Date: Tue, 12 Jun 2018 16:26:16 +0200
Subject: Tokenize CSS (#40)

---
 .../src/main/java/org/sonar/css/plugin/Token.java  |  48 +++
 .../main/java/org/sonar/css/plugin/Tokenizer.java  | 134 ++++++++
 sonar-css-plugin/src/main/resources/tokenize.js    | 374 +++++++++++++++++++++
 3 files changed, 556 insertions(+)
 create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java
 create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java
 create mode 100644 sonar-css-plugin/src/main/resources/tokenize.js

(limited to 'sonar-css-plugin/src/main')
diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java
new file mode 100644
index 0000000..dc9af61
--- /dev/null
+++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java
@@ -0,0 +1,48 @@
+/*
+ * SonarCSS
+ * Copyright (C) 2018-2018 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.css.plugin;
+
+/** One lexical token of a CSS source: its category, its text and where it starts and ends. */
+public class Token {
+
+  /** Category of a token. */
+  public enum Type {
+    COMMENT, STRING, WORD, AT_WORD, BRACKETS, PUNCTUATOR
+  }
+
+  // All fields are assigned exactly once, in the constructor; they are final so that a token
+  // cannot be altered after it has been created.
+  final Type type;
+  final String text;
+  final Integer startLine;
+  final Integer startColumn;
+  final Integer endLine;
+  final Integer endColumn;
+
+  /** Creates a token of the given type and text, spanning the given start and end line/column. */
+  public Token(Type type, String text, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) {
+    this.type = type;
+    this.text = text;
+    this.startLine = startLine;
+    this.startColumn = startColumn;
+    this.endLine = endLine;
+    this.endColumn = endColumn;
+  }
+}
diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java
new file mode 100644
index 0000000..8f03492
--- /dev/null
+++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java
@@ -0,0 +1,134 @@
+/*
+ * SonarCSS
+ * Copyright (C) 2018-2018 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.css.plugin;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import javax.script.ScriptEngine;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+import org.sonar.api.internal.apachecommons.lang.StringEscapeUtils;
+import org.sonar.css.plugin.Token.Type;
+
+/** Tokenizes CSS by running the bundled {@code tokenize.js} script in a JavaScript engine. */
+public class Tokenizer {
+
+  /**
+   * Returns the tokens of {@code css}; whitespace tokens are left out.
+   *
+   * @throws ScriptException if evaluating the tokenizer script or calling {@code tokenize()} fails
+   */
+  public List<Token> tokenize(String css) throws ScriptException {
+    ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
+    // try-with-resources so that the script stream is closed instead of being leaked on every call
+    try (InputStream tokenizeScript = Tokenizer.class.getClassLoader().getResourceAsStream("tokenize.js")) {
+      engine.eval(new InputStreamReader(tokenizeScript, StandardCharsets.UTF_8));
+      return extractTokens(engine.eval("tokenize('" + StringEscapeUtils.escapeJavaScript(css) + "')"));
+    } catch (java.io.IOException e) {
+      throw new IllegalStateException("Failed to close tokenize.js", e);
+    }
+  }
+
+  // Converts the raw result of the script into Token objects, dropping whitespace tokens.
+  @SuppressWarnings("unchecked")
+  private static List<Token> extractTokens(Object tokens) {
+    // tokens is the result of the JavaScript function tokenize(): an array of arrays, one nested array per token.
+    // These JavaScript arrays are mapped in Java to Map objects where the array index is the key.
+    List<Token> resultList = new ArrayList<>();
+    for (Object tokenObject : ((Map<String, Object>) tokens).values()) {
+      Map<String, Object> tokenProperties = (Map<String, Object>) tokenObject;
+
+      // skip whitespace token (size < 4)
+      if (tokenProperties.size() >= 4) {
+        String text = tokenProperties.get("1").toString();
+        Type type = computeType(tokenProperties.get("0").toString(), text);
+        Integer startLine = convertToInt(tokenProperties.get("2"));
+        Integer startColumn = convertToInt(tokenProperties.get("3"));
+
+        // all cases except for punctuator type
+        if (tokenProperties.size() == 6) {
+          Integer endLine = convertToInt(tokenProperties.get("4"));
+          Integer endColumn = convertToInt(tokenProperties.get("5"));
+
+          if (isTokenWithPunctuator(text, ",", startLine, endLine) || isTokenWithPunctuator(text, ":", startLine, endLine)) {
+            resultList.addAll(splitTokenWithPunctuator(text, type, startLine, startColumn, endLine, endColumn));
+          } else {
+            resultList.add(new Token(type, text, startLine, startColumn, endLine, endColumn));
+          }
+        } else {
+          // is punctuator
+          resultList.add(new Token(type, text, startLine, startColumn, startLine, startColumn));
+        }
+      }
+    }
+
+    return resultList;
+  }
+
+  // The JavaScript tokenizer does not return 2 tokens for words ending with a comma (e.g. foo,) or for words
+  // starting with an at symbol and ending with a colon (e.g. @base:), so such a word is split into 2 tokens
+  // (the word without the punctuator, then the punctuator).
+  // For the sake of simplicity we don't handle words ending with the punctuator on a new line.
+  private static boolean isTokenWithPunctuator(String text, String punctuator, Integer startLine, Integer endLine) {
+    return text.length() > 1 && text.endsWith(punctuator) && startLine.equals(endLine);
+  }
+
+  // Splits text into the token without its last character, followed by that last character as a punctuator.
+  private static List<Token> splitTokenWithPunctuator(String text, Type type, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) {
+    List<Token> tokenList = new ArrayList<>();
+    tokenList.add(new Token(type, text.substring(0, text.length() - 1), startLine, startColumn, endLine, endColumn - 1));
+    tokenList.add(new Token(Type.PUNCTUATOR, text.substring(text.length() - 1), startLine, endColumn, endLine, endColumn));
+    return tokenList;
+  }
+
+  // Accepts the Double and the Integer representation of a script number; any other value is rejected.
+  private static Integer convertToInt(Object value) {
+    if (value instanceof Double) {
+      return ((Double) value).intValue();
+    } else if (value instanceof Integer) {
+      return (Integer) value;
+    } else {
+      throw new IllegalStateException("Failed to convert to number: " + value);
+    }
+  }
+
+  // Maps the type name used by tokenize.js to a Type; names without a case of their own are punctuators.
+  // A "word" that consists of a single comma is a punctuator as well.
+  private static Type computeType(String type, String text) {
+    switch (type) {
+      case "at-word":
+        return Type.AT_WORD;
+      case "word":
+        return ",".equals(text) ? Type.PUNCTUATOR : Type.WORD;
+      case "comment":
+        return Type.COMMENT;
+      case "string":
+        return Type.STRING;
+      case "brackets":
+        return Type.BRACKETS;
+      default:
+        return Type.PUNCTUATOR;
+    }
+  }
+}
diff --git a/sonar-css-plugin/src/main/resources/tokenize.js b/sonar-css-plugin/src/main/resources/tokenize.js
new file mode 100644
index 0000000..bbf9b1b
--- /dev/null
+++ b/sonar-css-plugin/src/main/resources/tokenize.js
@@ -0,0 +1,374 @@
+/*
+ * SonarCSS
+ * Copyright (C) 2018-2018 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright 2013 Andrey Sitnik <andrey@sitnik.ru>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+var SINGLE_QUOTE = '\''.charCodeAt(0); // character codes compared against css.charCodeAt() by the tokenizer
+var DOUBLE_QUOTE = '"'.charCodeAt(0);
+var BACKSLASH = '\\'.charCodeAt(0);
+var SLASH = '/'.charCodeAt(0);
+var NEWLINE = '\n'.charCodeAt(0);
+var SPACE = ' '.charCodeAt(0);
+var FEED = '\f'.charCodeAt(0);
+var TAB = '\t'.charCodeAt(0);
+var CR = '\r'.charCodeAt(0);
+var OPEN_SQUARE = '['.charCodeAt(0);
+var CLOSE_SQUARE = ']'.charCodeAt(0);
+var OPEN_PARENTHESES = '('.charCodeAt(0);
+var CLOSE_PARENTHESES = ')'.charCodeAt(0);
+var OPEN_CURLY = '{'.charCodeAt(0);
+var CLOSE_CURLY = '}'.charCodeAt(0);
+var SEMICOLON = ';'.charCodeAt(0);
+var ASTERISK = '*'.charCodeAt(0);
+var COLON = ':'.charCodeAt(0);
+var AT = '@'.charCodeAt(0);
+// Patterns used by the tokenizer; the two global ones are driven through lastIndex.
+var RE_AT_END = /[ \n\t\r\f\{\(\)'"\\;/\[\]#]/g; // first character that no longer belongs to an at-word
+var RE_WORD_END = /[ \n\t\r\f\(\)\{\}:;@!'"\\\]\[#]|\/(?=\*)/g; // end of a word, or the '/' opening a comment
+var RE_BAD_BRACKET = /.[\\\/\("'\n]/; // "(...)" content that is not emitted as one brackets token
+var RE_HEX_ESCAPE = /[a-f0-9]/i; // one hexadecimal digit, either case
+
+// Tokenizes the whole css string and returns every token (whitespace included) as an array of arrays.
+function tokenize(css) {
+    // unclosed() throws the value returned by error(); the former no-op turned that into a bare `throw undefined`
+    var input = {css: css, error: function(msg, line, col) { return new Error(msg + ' at ' + line + ':' + col); }};
+    var processor = tokenizer(input);
+    var tokens = [];
+    while (!processor.endOfFile()) tokens.push(processor.nextToken());
+    return tokens;
+}
+
+function tokenizer(input ) { // builds a tokenizer over input.css; returns {back, nextToken, endOfFile}
+	var options = {}; // always empty here, so options.ignoreErrors below is undefined
+    var css = input.css.valueOf();
+    var ignore = options.ignoreErrors; // falsy, so the `if ( ignore )` recovery branches are never taken
+
+    var code, next, quote, lines, last, content, escape,
+        nextLine, nextOffset, escaped, escapePos, prev, n, currentToken;
+
+    var length = css.length;
+    var offset = -1; // index of the last line break seen; a column is computed as pos - offset
+    var line = 1; // current line number
+    var pos = 0; // index in css of the next character to read
+    var buffer = []; // word tokens emitted so far (consulted by the OPEN_PARENTHESES case)
+    var returned = []; // tokens given back through back(), served first by nextToken()
+
+    function unclosed(what) { // throws whatever input.error() returns for an unterminated construct
+        throw input.error('Unclosed ' + what, line, pos - offset);
+    }
+
+    function endOfFile() { // true once the input is consumed and no given-back token is waiting
+        return returned.length === 0 && pos >= length;
+    }
+
+    function nextToken() { // returns the next token array, or undefined at the end of the input
+        if ( returned.length ) return returned.pop();
+        if ( pos >= length ) return;
+
+        code = css.charCodeAt(pos);
+        if ( code === NEWLINE || code === FEED ||
+             code === CR && css.charCodeAt(pos + 1) !== NEWLINE ) {
+            offset = pos; // columns restart after this line break
+            line += 1;
+        }
+
+        switch ( code ) {
+        case NEWLINE: // a run of whitespace becomes ['space', text], without any position
+        case SPACE:
+        case TAB:
+        case CR:
+        case FEED:
+            next = pos;
+            do {
+                next += 1;
+                code = css.charCodeAt(next);
+                if ( code === NEWLINE ) { // inside the run only a line feed advances the line counter
+                    offset = next;
+                    line += 1;
+                }
+            } while ( code === SPACE   ||
+                      code === NEWLINE ||
+                      code === TAB     ||
+                      code === CR      ||
+                      code === FEED );
+
+            currentToken = ['space', css.slice(pos, next)];
+            pos = next - 1; // the pos++ after the switch then lands on next
+            break;
+
+        case OPEN_SQUARE: // single-character tokens are [type, text, line, column]
+            currentToken = ['[', '[', line, pos - offset];
+            break;
+
+        case CLOSE_SQUARE:
+            currentToken = [']', ']', line, pos - offset];
+            break;
+
+        case OPEN_CURLY:
+            currentToken = ['{', '{', line, pos - offset];
+            break;
+
+        case CLOSE_CURLY:
+            currentToken = ['}', '}', line, pos - offset];
+            break;
+
+        case COLON:
+            currentToken = [':', ':', line, pos - offset];
+            break;
+
+        case SEMICOLON:
+            currentToken = [';', ';', line, pos - offset];
+            break;
+
+        case OPEN_PARENTHESES:
+            prev = buffer.length ? buffer.pop()[1] : ''; // text of the most recently buffered word token, if any
+            n    = css.charCodeAt(pos + 1);
+            if ( prev === 'url' && // url( not followed by a quote or whitespace: read up to the first unescaped ')'
+                 n !== SINGLE_QUOTE && n !== DOUBLE_QUOTE &&
+                 n !== SPACE && n !== NEWLINE && n !== TAB &&
+                 n !== FEED && n !== CR ) {
+                next = pos;
+                do {
+                    escaped = false;
+                    next    = css.indexOf(')', next + 1);
+                    if ( next === -1 ) {
+                        if ( ignore ) {
+                            next = pos;
+                            break;
+                        } else {
+                            unclosed('bracket');
+                        }
+                    }
+                    escapePos = next;
+                    while ( css.charCodeAt(escapePos - 1) === BACKSLASH ) { // odd count of backslashes => ')' is escaped
+                        escapePos -= 1;
+                        escaped = !escaped;
+                    }
+                } while ( escaped );
+
+                currentToken = ['brackets', css.slice(pos, next + 1),
+                    line, pos  - offset,
+                    line, next - offset
+                ];
+
+                pos = next;
+
+            } else {
+                next    = css.indexOf(')', pos + 1);
+                content = css.slice(pos, next + 1);
+
+                if ( next === -1 || RE_BAD_BRACKET.test(content) ) { // no simple "(...)" group: emit only the '('
+                    currentToken = ['(', '(', line, pos - offset];
+                } else {
+                    currentToken = ['brackets', content,
+                        line, pos  - offset,
+                        line, next - offset
+                    ];
+                    pos = next;
+                }
+            }
+
+            break;
+
+        case CLOSE_PARENTHESES:
+            currentToken = [')', ')', line, pos - offset];
+            break;
+
+        case SINGLE_QUOTE: // quoted string: read up to the matching quote that is not escaped
+        case DOUBLE_QUOTE:
+            quote = code === SINGLE_QUOTE ? '\'' : '"';
+            next  = pos;
+            do {
+                escaped = false;
+                next    = css.indexOf(quote, next + 1);
+                if ( next === -1 ) {
+                    if ( ignore ) {
+                        next = pos + 1;
+                        break;
+                    } else {
+                        unclosed('string');
+                    }
+                }
+                escapePos = next;
+                while ( css.charCodeAt(escapePos - 1) === BACKSLASH ) { // odd count of backslashes => quote is escaped
+                    escapePos -= 1;
+                    escaped = !escaped;
+                }
+            } while ( escaped );
+
+            content = css.slice(pos, next + 1);
+            lines   = content.split('\n'); // the string may span lines: work out line and offset at its end
+            last    = lines.length - 1;
+
+            if ( last > 0 ) {
+                nextLine   = line + last;
+                nextOffset = next - lines[last].length; // index of the last line feed inside the token
+            } else {
+                nextLine   = line;
+                nextOffset = offset;
+            }
+
+            currentToken = ['string', css.slice(pos, next + 1),
+                line, pos  - offset,
+                nextLine, next - nextOffset
+            ];
+
+            offset = nextOffset;
+            line   = nextLine;
+            pos    = next;
+            break;
+
+        case AT:
+            RE_AT_END.lastIndex = pos + 1; // look for the terminator starting right after '@'
+            RE_AT_END.test(css);
+            if ( RE_AT_END.lastIndex === 0 ) { // lastIndex is reset to 0 when test() finds no match
+                next = css.length - 1;
+            } else {
+                next = RE_AT_END.lastIndex - 2; // lastIndex is one past the terminator; the at-word ends before it
+            }
+
+            currentToken = ['at-word', css.slice(pos, next + 1),
+                line, pos  - offset,
+                line, next - offset
+            ];
+
+            pos = next;
+            break;
+
+        case BACKSLASH:
+            next   = pos;
+            escape = true;
+            while ( css.charCodeAt(next + 1) === BACKSLASH ) { // escape stays true when the backslash count is odd
+                next  += 1;
+                escape = !escape;
+            }
+            code = css.charCodeAt(next + 1);
+            if ( escape && (code !== SLASH   &&
+                            code !== SPACE   &&
+                            code !== NEWLINE &&
+                            code !== TAB     &&
+                            code !== CR      &&
+                            code !== FEED ) ) {
+                next += 1; // include the escaped character in the word
+                if ( RE_HEX_ESCAPE.test(css.charAt(next)) ) { // hex escape: take further hex digits and one space
+                    while ( RE_HEX_ESCAPE.test(css.charAt(next + 1)) ) {
+                        next += 1;
+                    }
+                    if ( css.charCodeAt(next + 1) === SPACE ) {
+                        next += 1;
+                    }
+                }
+            }
+
+            currentToken = ['word', css.slice(pos, next + 1),
+                line, pos  - offset,
+                line, next - offset
+            ];
+
+            pos = next;
+            break;
+
+        default:
+            if ( code === SLASH && css.charCodeAt(pos + 1) === ASTERISK ) { // start of a /* ... */ comment
+                next = css.indexOf('*/', pos + 2) + 1; // index of the closing '/', or 0 when "*/" is missing
+                if ( next === 0 ) {
+                    if ( ignore ) {
+                        next = css.length;
+                    } else {
+                        unclosed('comment');
+                    }
+                }
+
+                content = css.slice(pos, next + 1);
+                lines   = content.split('\n'); // the comment may span lines: work out line and offset at its end
+                last    = lines.length - 1;
+
+                if ( last > 0 ) {
+                    nextLine   = line + last;
+                    nextOffset = next - lines[last].length; // index of the last line feed inside the token
+                } else {
+                    nextLine   = line;
+                    nextOffset = offset;
+                }
+
+                currentToken = ['comment', content,
+                    line,     pos  - offset,
+                    nextLine, next - nextOffset
+                ];
+
+                offset = nextOffset;
+                line   = nextLine;
+                pos    = next;
+
+            } else { // anything else is a word running up to the next RE_WORD_END match
+                RE_WORD_END.lastIndex = pos + 1;
+                RE_WORD_END.test(css);
+                if ( RE_WORD_END.lastIndex === 0 ) { // no match: the word runs to the end of the input
+                    next = css.length - 1;
+                } else {
+                    next = RE_WORD_END.lastIndex - 2;
+                }
+
+                currentToken = ['word', css.slice(pos, next + 1),
+                    line, pos  - offset,
+                    line, next - offset
+                ];
+
+                buffer.push(currentToken); // kept so that OPEN_PARENTHESES can recognise a preceding `url`
+
+                pos = next;
+            }
+
+            break;
+        }
+
+        pos++; // step past the last character of the token just built
+        return currentToken;
+    }
+
+    function back(token) { // makes nextToken() return this token again before reading further input
+        returned.push(token);
+    }
+
+    return {
+        back:back, nextToken:nextToken, endOfFile:endOfFile
+    };
+}
-- 
cgit v1.2.3