From c6053785e5f8f01a544cb106afd9109a6ba7d7a1 Mon Sep 17 00:00:00 2001 From: Amaury Levé Date: Thu, 21 Jun 2018 09:05:37 +0200 Subject: Improve tokenizer and highlighting --- .../main/java/org/sonar/css/plugin/CssLexer.java | 76 +++++++++ .../main/java/org/sonar/css/plugin/CssToken.java | 45 ++++++ .../java/org/sonar/css/plugin/CssTokenType.java | 49 ++++++ .../java/org/sonar/css/plugin/MetricSensor.java | 39 +++-- .../src/main/java/org/sonar/css/plugin/Token.java | 48 ------ .../main/java/org/sonar/css/plugin/Tokenizer.java | 171 +++++---------------- 6 files changed, 233 insertions(+), 195 deletions(-) create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssToken.java create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssTokenType.java delete mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java (limited to 'sonar-css-plugin/src/main/java/org/sonar/css') diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java new file mode 100644 index 0000000..9a4bb58 --- /dev/null +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssLexer.java @@ -0,0 +1,76 @@ +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.css.plugin; + +import com.sonar.sslr.impl.Lexer; + +import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp; + +// This is a at-best lexer. +// It is far from being entirely matching the standard definition of css/less/scss tokens nor +// following the theory of what a lexer responsibilities are but as we are only building line metrics and highlighting +// on top of it we decided to focus on simplicity over being extensive. +public final class CssLexer { + + private static final String NEW_LINE = "(\r\n|\r|\n|\f)"; + private static final String WHITESPACE = "[\t\n\f\r ]"; + private static final String NON_ASCII = "[^\\p{ASCII}]"; + private static final String HEX_DIGIT = "0-9a-fA-F"; + private static final String ESCAPE = "(\\\\[" + HEX_DIGIT + "]{1,6}" + WHITESPACE + "?)|\\[^\r\n\f" + HEX_DIGIT + "]"; + + private static final String PUNCTUATOR = "[!:,;%&+#\\*-/=>\\(\\)\\[\\]\\{\\}]"; + + private static final String MULTI_LINE_COMMENT = "/\\*(.|" + NEW_LINE + ")*?\\*/"; + private static final String INLINE_COMMENT = "//.*"; + private static final String COMMENT = "(" + INLINE_COMMENT + "|" + MULTI_LINE_COMMENT + ")"; + + private static final String NUMBER = "[+|-]?\\d*\\.?\\d+([a-z]+|%)?"; + + private static final String NAME_CHAR = "[a-zA-Z0-9_-]|" + NON_ASCII + "|" + ESCAPE; + private static final String NAME_START = "[a-zA-Z_]|" + NON_ASCII + "|" + ESCAPE; + + private static final String IDENTIFIER = "-?(" + NAME_START + ")(" + NAME_CHAR + ")*"; + private static final String AT_IDENTIFIER = "@+" + IDENTIFIER; + private static final String HASH_IDENTIFIER = "#(" + NAME_CHAR + ")+"; + private static final String DOLLAR_IDENTIFIER = "\\$(" + NAME_CHAR + ")+"; + + private static final String DOUBLE_QUOTE_STRING = "~?\"([^\"\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*\""; + private static final String SINGLE_QUOTE_STRING = "~?'([^'\\\\\r\n\f]|" + ESCAPE + "|\\\\" + NEW_LINE + ")*'"; + private static final String STRING = "(" + SINGLE_QUOTE_STRING + "|" + DOUBLE_QUOTE_STRING + ")"; + + private CssLexer() { + } + + public static Lexer create() { + return Lexer.builder() + .withFailIfNoChannelToConsumeOneCharacter(false) + + .withChannel(regexp(CssTokenType.COMMENT, COMMENT)) + .withChannel(regexp(CssTokenType.STRING, STRING)) + .withChannel(regexp(CssTokenType.AT_IDENTIFIER, AT_IDENTIFIER)) + .withChannel(regexp(CssTokenType.HASH_IDENTIFIER, HASH_IDENTIFIER)) + .withChannel(regexp(CssTokenType.DOLLAR_IDENTIFIER, DOLLAR_IDENTIFIER)) + .withChannel(regexp(CssTokenType.IDENTIFIER, IDENTIFIER)) + .withChannel(regexp(CssTokenType.NUMBER, NUMBER)) + .withChannel(regexp(CssTokenType.PUNCTUATOR, PUNCTUATOR)) + + .build(); + } +} diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssToken.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssToken.java new file mode 100644 index 0000000..4ba0cc6 --- /dev/null +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssToken.java @@ -0,0 +1,45 @@ +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.css.plugin; + +import com.sonar.sslr.api.Token; +import com.sonar.sslr.api.TokenType; +import org.sonarsource.analyzer.commons.TokenLocation; + +public class CssToken { + CssTokenType type; + String text; + Integer startLine; + Integer startColumn; + Integer endLine; + Integer endColumn; + + public CssToken(Token token) { + TokenType tokenType = token.getType(); + this.type = (CssTokenType)tokenType; + this.text = token.getValue(); + + TokenLocation tokenLocation = new TokenLocation(token.getLine(), token.getColumn(), token.getValue()); + this.startLine = tokenLocation.startLine(); + this.startColumn = tokenLocation.startLineOffset(); + this.endLine = tokenLocation.endLine(); + this.endColumn = tokenLocation.endLineOffset(); + } +} diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssTokenType.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssTokenType.java new file mode 100644 index 0000000..dccc5b7 --- /dev/null +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/CssTokenType.java @@ -0,0 +1,49 @@ +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.css.plugin; + +import com.sonar.sslr.api.AstNode; +import com.sonar.sslr.api.TokenType; + +public enum CssTokenType implements TokenType { + COMMENT, + PUNCTUATOR, + NUMBER, + STRING, + AT_IDENTIFIER, + HASH_IDENTIFIER, + DOLLAR_IDENTIFIER, + IDENTIFIER; + + @Override + public String getName() { + return name(); + } + + @Override + public String getValue() { + return name(); + } + + @Override + public boolean hasToBeSkippedFromAst(AstNode node) { + return false; + } +} diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/MetricSensor.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/MetricSensor.java index abbbd50..6257b74 100644 --- a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/MetricSensor.java +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/MetricSensor.java @@ -21,7 +21,6 @@ package org.sonar.css.plugin; import java.io.IOException; import java.util.List; -import javax.script.ScriptException; import org.sonar.api.batch.fs.FileSystem; import org.sonar.api.batch.fs.InputFile; import org.sonar.api.batch.sensor.Sensor; @@ -57,11 +56,11 @@ public class MetricSensor implements Sensor { private static void saveHighlights(SensorContext sensorContext, InputFile input, Tokenizer tokenizer) { try { NewHighlighting highlighting = sensorContext.newHighlighting().onFile(input); - List tokenList = tokenizer.tokenize(input.contents()); + List tokenList = tokenizer.tokenize(input.contents()); for (int i = 0; i < tokenList.size(); i++) { - Token currentToken = tokenList.get(i); - Token nextToken = i + 1 == tokenList.size() ? null : tokenList.get(i + 1); + CssToken currentToken = tokenList.get(i); + CssToken nextToken = i + 1 < tokenList.size() ? tokenList.get(i + 1) : null; TypeOfText highlightingType = null; switch (currentToken.type) { @@ -73,18 +72,32 @@ public class MetricSensor implements Sensor { highlightingType = TypeOfText.STRING; break; - case WORD: - if (Character.isDigit(currentToken.text.charAt(0)) || currentToken.text.matches("^#[0-9a-fA-F]+$")) { + case NUMBER: + highlightingType = TypeOfText.CONSTANT; + break; + + case AT_IDENTIFIER: + highlightingType = TypeOfText.ANNOTATION; + break; + + case DOLLAR_IDENTIFIER: + highlightingType = TypeOfText.KEYWORD; + break; + + case HASH_IDENTIFIER: + if (currentToken.text.matches("^#[0-9a-fA-F]+$")) { highlightingType = TypeOfText.CONSTANT; - } else if (nextToken != null && nextToken.text.equals(":")) { - highlightingType = TypeOfText.KEYWORD_LIGHT; - } else if (currentToken.text.startsWith(".") || (nextToken != null && nextToken.text.startsWith("{"))) { + } else { highlightingType = TypeOfText.KEYWORD; } break; - case AT_WORD: - highlightingType = TypeOfText.ANNOTATION; + case IDENTIFIER: + // We want to highlight the property key of a css/scss/less file and as the tokenizer is putting the ':' into another token + // we need to look for identifier followed by a PUNCTUATOR token with text ':'. + if (nextToken != null && nextToken.text.equals(":")) { + highlightingType = TypeOfText.KEYWORD_LIGHT; + } break; default: @@ -92,14 +105,12 @@ public class MetricSensor implements Sensor { } if (highlightingType != null) { - highlighting.highlight(currentToken.startLine, currentToken.startColumn - 1, currentToken.endLine, currentToken.endColumn, highlightingType); + highlighting.highlight(currentToken.startLine, currentToken.startColumn, currentToken.endLine, currentToken.endColumn, highlightingType); } } highlighting.save(); - } catch (ScriptException e) { - LOG.error(String.format("Failed to tokenize file '%s'", input.toString()), e); } catch (IOException e) { LOG.error(String.format("Failed to read file '%s'", input.toString()), e); } diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java deleted file mode 100644 index dc9af61..0000000 --- a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * SonarCSS - * Copyright (C) 2018-2018 SonarSource SA - * mailto:info AT sonarsource DOT com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ -package org.sonar.css.plugin; - -public class Token { - - public enum Type { - COMMENT, - STRING, - WORD, - AT_WORD, - BRACKETS, - PUNCTUATOR - } - - Type type; - String text; - Integer startLine; - Integer startColumn; - Integer endLine; - Integer endColumn; - - public Token(Type type, String text, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) { - this.text = text; - this.type = type; - this.startLine = startLine; - this.startColumn = startColumn; - this.endLine = endLine; - this.endColumn = endColumn; - } -} diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java index 220bfaa..cf84e08 100644 --- a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java @@ -1,133 +1,38 @@ -/* - * SonarCSS - * Copyright (C) 2018-2018 SonarSource SA - * mailto:info AT sonarsource DOT com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ -package org.sonar.css.plugin; - -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import javax.script.ScriptEngine; -import javax.script.ScriptException; -import jdk.nashorn.api.scripting.NashornScriptEngineFactory; -import org.apache.commons.lang.StringEscapeUtils; -import org.sonar.css.plugin.Token.Type; - -public class Tokenizer { - - public List tokenize(String css) throws ScriptException { - ScriptEngine engine = new NashornScriptEngineFactory().getScriptEngine(); - InputStream tokenizeScript = Tokenizer.class.getClassLoader().getResourceAsStream("tokenize.js"); - engine.eval(new InputStreamReader(tokenizeScript, StandardCharsets.UTF_8)); - String cssInput = "tokenize('" + StringEscapeUtils.escapeJavaScript(css) + "')"; - Object tokens = engine.eval(cssInput); - return extractTokens(tokens); - } - - private static List extractTokens(Object tokens) { - // tokens is result of call to javascript function tokenize(). It returns an array of arrays, where nested arrays - // correspond to tokens. These array javascript objects mapped in Java to Map objects where array index is key. - - List resultList = new ArrayList<>(); - for (Object tokenObject : ((Map) tokens).values()) { - - // Access the inner arrays (disregard the keys) and use their length to decide which type of token we are - // dealing with. - Map tokenProperties = (Map) tokenObject; - - // skip whitespace token (size < 4) - if (tokenProperties.size() >= 4) { - String text = tokenProperties.get("1").toString(); - Type type = computeType(tokenProperties.get("0").toString(), text); - Integer startLine = convertToInt(tokenProperties.get("2")); - Integer startColumn = ((Double) tokenProperties.get("3")).intValue(); - - // all cases except for punctuator type - if (tokenProperties.size() == 6) { - Integer endLine = convertToInt(tokenProperties.get("4")); - Integer endColumn = ((Double) tokenProperties.get("5")).intValue(); - - - if (isTokenWithPunctuator(text, ",", startLine, endLine)) { - resultList.addAll(splitTokenWithPunctuator(text, type, startLine, startColumn, endLine, endColumn)); - } else if (isTokenWithPunctuator(text, ":", startLine, endLine)) { - resultList.addAll(splitTokenWithPunctuator(text, type, startLine, startColumn, endLine, endColumn)); - } else { - resultList.add(new Token(type, text, startLine, startColumn, endLine, endColumn)); - } - } else { - // is punctuator - resultList.add(new Token(type, text, startLine, startColumn, startLine, startColumn)); - } - } - } - - return resultList; - } - - // Javascript tokenizer is not returning 2 tokens for words ending with a comma (e.g. foo,) and for words starting - // with at symbol and endings with colon (e.g. @base:) so we need to split the word into 2 tokens (1 word without - // the punctuator and 1 punctuator). - // For the sake of simplicity we don't handle words ending with the punctuator on a new line. - private static Boolean isTokenWithPunctuator(String text, String punctuator, Integer startLine, Integer endLine) { - return text.length() > 1 && text.endsWith(punctuator) && startLine.equals(endLine); - } - - private static List splitTokenWithPunctuator(String text, Type type, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) { - List tokenList = new ArrayList<>(); - - tokenList.add(new Token(type, text.substring(0, text.length() - 1), startLine, startColumn, endLine, endColumn - 1)); - tokenList.add(new Token(Type.PUNCTUATOR, text.substring(text.length() - 1), startLine, endColumn, endLine, endColumn)); - - return tokenList; - } - - private static Integer convertToInt(Object value) { - if (value instanceof Double) { - return ((Double) value).intValue(); - } else if (value instanceof Integer) { - return (Integer) value; - } else { - throw new IllegalStateException("Failed to convert to number: " + value); - } - } - - private static Type computeType(String type, String text) { - switch (type) { - case "at-word": - return Type.AT_WORD; - case "word": - if (",".equals(text)) { - return Type.PUNCTUATOR; - } else { - return Type.WORD; - } - case "comment": - return Type.COMMENT; - case "string": - return Type.STRING; - case "brackets": - return Type.BRACKETS; - default: - return Type.PUNCTUATOR; - } - } -} +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.css.plugin; + +import com.sonar.sslr.api.Token; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +public class Tokenizer { + + public List tokenize(String css) { + List tokenList = CssLexer.create().lex(css); + + // remove last token (EOF token) + List cloneTokenList = new ArrayList<>(tokenList); + cloneTokenList.remove(cloneTokenList.size() - 1); + + return cloneTokenList.stream().map(CssToken::new).collect(Collectors.toList()); + } +} -- cgit v1.2.3