From df153ba45fffa47f1bff7a4201d5fd16fc7b3445 Mon Sep 17 00:00:00 2001 From: Amaury Levé Date: Tue, 12 Jun 2018 16:26:16 +0200 Subject: Tokenize CSS (#40) --- .../src/main/java/org/sonar/css/plugin/Token.java | 48 +++ .../main/java/org/sonar/css/plugin/Tokenizer.java | 134 ++++++++ sonar-css-plugin/src/main/resources/tokenize.js | 374 +++++++++++++++++++++ 3 files changed, 556 insertions(+) create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java create mode 100644 sonar-css-plugin/src/main/java/org/sonar/css/plugin/Tokenizer.java create mode 100644 sonar-css-plugin/src/main/resources/tokenize.js (limited to 'sonar-css-plugin/src/main') diff --git a/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java new file mode 100644 index 0000000..dc9af61 --- /dev/null +++ b/sonar-css-plugin/src/main/java/org/sonar/css/plugin/Token.java @@ -0,0 +1,48 @@ +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
/**
 * A single lexical token of a CSS source, together with its position.
 *
 * <p>Instances are immutable: every field is assigned once in the constructor. Values are stored
 * exactly as passed in; no validation or normalization is performed.
 */
public class Token {

  /** Lexical category of a {@link Token}. */
  public enum Type {
    COMMENT,
    STRING,
    WORD,
    AT_WORD,
    BRACKETS,
    PUNCTUATOR
  }

  final Type type;
  final String text;
  final Integer startLine;
  final Integer startColumn;
  final Integer endLine;
  final Integer endColumn;

  /**
   * Creates a token.
   *
   * @param type lexical category of the token
   * @param text source text covered by the token
   * @param startLine line on which the token starts
   * @param startColumn column on which the token starts
   * @param endLine line on which the token ends
   * @param endColumn column on which the token ends
   */
  public Token(Type type, String text, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) {
    this.type = type;
    this.text = text;
    this.startLine = startLine;
    this.startColumn = startColumn;
    this.endLine = endLine;
    this.endColumn = endColumn;
  }
}
+ */ +package org.sonar.css.plugin; + +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import javax.script.ScriptException; +import org.sonar.api.internal.apachecommons.lang.StringEscapeUtils; +import org.sonar.css.plugin.Token.Type; + +public class Tokenizer { + + public List tokenize(String css) throws ScriptException { + ScriptEngineManager factory = new ScriptEngineManager(); + ScriptEngine engine = factory.getEngineByName("JavaScript"); + InputStream tokenizeScript = Tokenizer.class.getClassLoader().getResourceAsStream("tokenize.js"); + engine.eval(new InputStreamReader(tokenizeScript, StandardCharsets.UTF_8)); + String cssInput = "tokenize('" + StringEscapeUtils.escapeJavaScript(css) + "')"; + Object tokens = engine.eval(cssInput); + return extractTokens(tokens); + } + + private static List extractTokens(Object tokens) { + // tokens is result of call to javascript function tokenize(). It returns an array of arrays, where nested arrays + // correspond to tokens. These array javascript objects mapped in Java to Map objects where array index is key. + + List resultList = new ArrayList<>(); + for (Object tokenObject : ((Map) tokens).values()) { + + // Access the inner arrays (disregard the keys) and use their length to decide which type of token we are + // dealing with. 
+ Map tokenProperties = (Map) tokenObject; + + // skip whitespace token (size < 4) + if (tokenProperties.size() >= 4) { + String text = tokenProperties.get("1").toString(); + Type type = computeType(tokenProperties.get("0").toString(), text); + Integer startLine = convertToInt(tokenProperties.get("2")); + Integer startColumn = ((Double) tokenProperties.get("3")).intValue(); + + // all cases except for punctuator type + if (tokenProperties.size() == 6) { + Integer endLine = convertToInt(tokenProperties.get("4")); + Integer endColumn = ((Double) tokenProperties.get("5")).intValue(); + + + if (isTokenWithPunctuator(text, ",", startLine, endLine)) { + resultList.addAll(splitTokenWithPunctuator(text, type, startLine, startColumn, endLine, endColumn)); + } else if (isTokenWithPunctuator(text, ":", startLine, endLine)) { + resultList.addAll(splitTokenWithPunctuator(text, type, startLine, startColumn, endLine, endColumn)); + } else { + resultList.add(new Token(type, text, startLine, startColumn, endLine, endColumn)); + } + } else { + // is punctuator + resultList.add(new Token(type, text, startLine, startColumn, startLine, startColumn)); + } + } + } + + return resultList; + } + + // Javascript tokenizer is not returning 2 tokens for words ending with a comma (e.g. foo,) and for words starting + // with at symbol and endings with colon (e.g. @base:) so we need to split the word into 2 tokens (1 word without + // the punctuator and 1 punctuator). + // For the sake of simplicity we don't handle words ending with the punctuator on a new line. 
+ private static Boolean isTokenWithPunctuator(String text, String punctuator, Integer startLine, Integer endLine) { + return text.length() > 1 && text.endsWith(punctuator) && startLine.equals(endLine); + } + + private static List splitTokenWithPunctuator(String text, Type type, Integer startLine, Integer startColumn, Integer endLine, Integer endColumn) { + List tokenList = new ArrayList<>(); + + tokenList.add(new Token(type, text.substring(0, text.length() - 1), startLine, startColumn, endLine, endColumn - 1)); + tokenList.add(new Token(Type.PUNCTUATOR, text.substring(text.length() - 1), startLine, endColumn, endLine, endColumn)); + + return tokenList; + } + + private static Integer convertToInt(Object value) { + if (value instanceof Double) { + return ((Double) value).intValue(); + } else if (value instanceof Integer) { + return (Integer) value; + } else { + throw new IllegalStateException("Failed to convert to number: " + value); + } + } + + private static Type computeType(String type, String text) { + switch (type) { + case "at-word": + return Type.AT_WORD; + case "word": + if (",".equals(text)) { + return Type.PUNCTUATOR; + } else { + return Type.WORD; + } + case "comment": + return Type.COMMENT; + case "string": + return Type.STRING; + case "brackets": + return Type.BRACKETS; + default: + return Type.PUNCTUATOR; + } + } +} diff --git a/sonar-css-plugin/src/main/resources/tokenize.js b/sonar-css-plugin/src/main/resources/tokenize.js new file mode 100644 index 0000000..bbf9b1b --- /dev/null +++ b/sonar-css-plugin/src/main/resources/tokenize.js @@ -0,0 +1,374 @@ +/* + * SonarCSS + * Copyright (C) 2018-2018 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* + * The MIT License (MIT) + * + * Copyright 2013 Andrey Sitnik + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
// Character codes compared against css.charCodeAt(...) while scanning.
var SINGLE_QUOTE = '\''.charCodeAt(0);
var DOUBLE_QUOTE = '"'.charCodeAt(0);
var BACKSLASH = '\\'.charCodeAt(0);
var SLASH = '/'.charCodeAt(0);
var NEWLINE = '\n'.charCodeAt(0);
var SPACE = ' '.charCodeAt(0);
var FEED = '\f'.charCodeAt(0);
var TAB = '\t'.charCodeAt(0);
var CR = '\r'.charCodeAt(0);
var OPEN_SQUARE = '['.charCodeAt(0);
var CLOSE_SQUARE = ']'.charCodeAt(0);
var OPEN_PARENTHESES = '('.charCodeAt(0);
var CLOSE_PARENTHESES = ')'.charCodeAt(0);
var OPEN_CURLY = '{'.charCodeAt(0);
var CLOSE_CURLY = '}'.charCodeAt(0);
var SEMICOLON = ';'.charCodeAt(0);
var ASTERISK = '*'.charCodeAt(0);
var COLON = ':'.charCodeAt(0);
var AT = '@'.charCodeAt(0);

// Global regexes: lastIndex is set before each test() to search from a given position.
var RE_AT_END = /[ \n\t\r\f\{\(\)'"\\;/\[\]#]/g;
var RE_WORD_END = /[ \n\t\r\f\(\)\{\}:;@!'"\\\]\[#]|\/(?=\*)/g;
var RE_BAD_BRACKET = /.[\\\/\("'\n]/;
var RE_HEX_ESCAPE = /[a-f0-9]/i;

/**
 * Tokenizes a CSS string and returns all tokens as an array of arrays.
 *
 * Token shapes produced by tokenizer():
 *   ['space', text]                                             whitespace
 *   [char, char, line, column]                                  one of [ ] { } : ; ( )
 *   [type, text, startLine, startColumn, endLine, endColumn]    'word', 'at-word', 'string',
 *                                                               'comment', 'brackets'
 * Lines and columns start at 1.
 *
 * An unterminated string, comment or url( bracket throws an Error whose message names the
 * construct and the position where scanning stopped.
 */
function tokenize(css) {
  var input = {
    css: css,
    // tokenizer() does `throw input.error(...)`, so this must return the value to throw;
    // returning nothing would make it throw `undefined`.
    error: function (message, line, column) {
      return new Error(message + ' (line ' + line + ', column ' + column + ')');
    }
  };
  var processor = tokenizer(input);
  var tokens = [];
  while (!processor.endOfFile()) {
    tokens.push(processor.nextToken());
  }
  return tokens;
}

/**
 * Creates a pull-style tokenizer over input.css.
 *
 * input.error(message, line, column) must return the value to throw for unclosed constructs.
 * Returns an object with nextToken(), endOfFile() and back(token).
 */
function tokenizer(input ) {
  var options = {};
  var css = input.css.valueOf();
  // Always undefined here because `options` is empty: unclosed constructs always throw.
  var ignore = options.ignoreErrors;

  var code, next, quote, lines, last, content, escape,
    nextLine, nextOffset, escaped, escapePos, prev, n, currentToken;

  var length = css.length;
  // Index of the last line break seen; column = pos - offset, hence -1 for a 1-based first column.
  var offset = -1;
  var line = 1;
  var pos = 0;
  // Words seen so far; the '(' case pops the latest one to detect `url(`.
  var buffer = [];
  // Tokens pushed back with back(); served first by nextToken().
  var returned = [];

  function unclosed(what) {
    throw input.error('Unclosed ' + what, line, pos - offset);
  }

  function endOfFile() {
    return returned.length === 0 && pos >= length;
  }

  function nextToken() {
    if ( returned.length ) return returned.pop();
    if ( pos >= length ) return;

    code = css.charCodeAt(pos);
    if ( code === NEWLINE || code === FEED ||
      code === CR && css.charCodeAt(pos + 1) !== NEWLINE ) {
      offset = pos;
      line += 1;
    }

    switch ( code ) {
      case NEWLINE:
      case SPACE:
      case TAB:
      case CR:
      case FEED:
        // Consume the whole run of whitespace as one token.
        next = pos;
        do {
          next += 1;
          code = css.charCodeAt(next);
          if ( code === NEWLINE ) {
            offset = next;
            line += 1;
          }
        } while ( code === SPACE ||
        code === NEWLINE ||
        code === TAB ||
        code === CR ||
        code === FEED );

        currentToken = ['space', css.slice(pos, next)];
        pos = next - 1;
        break;

      case OPEN_SQUARE:
        currentToken = ['[', '[', line, pos - offset];
        break;

      case CLOSE_SQUARE:
        currentToken = [']', ']', line, pos - offset];
        break;

      case OPEN_CURLY:
        currentToken = ['{', '{', line, pos - offset];
        break;

      case CLOSE_CURLY:
        currentToken = ['}', '}', line, pos - offset];
        break;

      case COLON:
        currentToken = [':', ':', line, pos - offset];
        break;

      case SEMICOLON:
        currentToken = [';', ';', line, pos - offset];
        break;

      case OPEN_PARENTHESES:
        prev = buffer.length ? buffer.pop()[1] : '';
        n = css.charCodeAt(pos + 1);
        if ( prev === 'url' &&
          n !== SINGLE_QUOTE && n !== DOUBLE_QUOTE &&
          n !== SPACE && n !== NEWLINE && n !== TAB &&
          n !== FEED && n !== CR ) {
          // Unquoted url(...): take everything up to the first unescaped ')'.
          next = pos;
          do {
            escaped = false;
            next = css.indexOf(')', next + 1);
            if ( next === -1 ) {
              if ( ignore ) {
                next = pos;
                break;
              } else {
                unclosed('bracket');
              }
            }
            // An odd number of preceding backslashes means the ')' is escaped.
            escapePos = next;
            while ( css.charCodeAt(escapePos - 1) === BACKSLASH ) {
              escapePos -= 1;
              escaped = !escaped;
            }
          } while ( escaped );

          currentToken = ['brackets', css.slice(pos, next + 1),
            line, pos - offset,
            line, next - offset
          ];

          pos = next;

        } else {
          next = css.indexOf(')', pos + 1);
          content = css.slice(pos, next + 1);

          if ( next === -1 || RE_BAD_BRACKET.test(content) ) {
            currentToken = ['(', '(', line, pos - offset];
          } else {
            currentToken = ['brackets', content,
              line, pos - offset,
              line, next - offset
            ];
            pos = next;
          }
        }

        break;

      case CLOSE_PARENTHESES:
        currentToken = [')', ')', line, pos - offset];
        break;

      case SINGLE_QUOTE:
      case DOUBLE_QUOTE:
        quote = code === SINGLE_QUOTE ? '\'' : '"';
        next = pos;
        do {
          escaped = false;
          next = css.indexOf(quote, next + 1);
          if ( next === -1 ) {
            if ( ignore ) {
              next = pos + 1;
              break;
            } else {
              unclosed('string');
            }
          }
          escapePos = next;
          while ( css.charCodeAt(escapePos - 1) === BACKSLASH ) {
            escapePos -= 1;
            escaped = !escaped;
          }
        } while ( escaped );

        content = css.slice(pos, next + 1);
        lines = content.split('\n');
        last = lines.length - 1;

        // A string containing line feeds moves the current line/offset forward.
        if ( last > 0 ) {
          nextLine = line + last;
          nextOffset = next - lines[last].length;
        } else {
          nextLine = line;
          nextOffset = offset;
        }

        currentToken = ['string', css.slice(pos, next + 1),
          line, pos - offset,
          nextLine, next - nextOffset
        ];

        offset = nextOffset;
        line = nextLine;
        pos = next;
        break;

      case AT:
        RE_AT_END.lastIndex = pos + 1;
        RE_AT_END.test(css);
        // lastIndex is reset to 0 when nothing matched: the at-word runs to the end of input.
        if ( RE_AT_END.lastIndex === 0 ) {
          next = css.length - 1;
        } else {
          next = RE_AT_END.lastIndex - 2;
        }

        currentToken = ['at-word', css.slice(pos, next + 1),
          line, pos - offset,
          line, next - offset
        ];

        pos = next;
        break;

      case BACKSLASH:
        next = pos;
        escape = true;
        while ( css.charCodeAt(next + 1) === BACKSLASH ) {
          next += 1;
          escape = !escape;
        }
        code = css.charCodeAt(next + 1);
        if ( escape && (code !== SLASH &&
          code !== SPACE &&
          code !== NEWLINE &&
          code !== TAB &&
          code !== CR &&
          code !== FEED ) ) {
          next += 1;
          // Hex escape: consume the following hex digits and one optional trailing space.
          if ( RE_HEX_ESCAPE.test(css.charAt(next)) ) {
            while ( RE_HEX_ESCAPE.test(css.charAt(next + 1)) ) {
              next += 1;
            }
            if ( css.charCodeAt(next + 1) === SPACE ) {
              next += 1;
            }
          }
        }

        currentToken = ['word', css.slice(pos, next + 1),
          line, pos - offset,
          line, next - offset
        ];

        pos = next;
        break;

      default:
        if ( code === SLASH && css.charCodeAt(pos + 1) === ASTERISK ) {
          // indexOf returns -1 when '*/' is missing, so next === 0 means "unclosed".
          next = css.indexOf('*/', pos + 2) + 1;
          if ( next === 0 ) {
            if ( ignore ) {
              next = css.length;
            } else {
              unclosed('comment');
            }
          }

          content = css.slice(pos, next + 1);
          lines = content.split('\n');
          last = lines.length - 1;

          if ( last > 0 ) {
            nextLine = line + last;
            nextOffset = next - lines[last].length;
          } else {
            nextLine = line;
            nextOffset = offset;
          }

          currentToken = ['comment', content,
            line, pos - offset,
            nextLine, next - nextOffset
          ];

          offset = nextOffset;
          line = nextLine;
          pos = next;

        } else {
          RE_WORD_END.lastIndex = pos + 1;
          RE_WORD_END.test(css);
          if ( RE_WORD_END.lastIndex === 0 ) {
            next = css.length - 1;
          } else {
            next = RE_WORD_END.lastIndex - 2;
          }

          currentToken = ['word', css.slice(pos, next + 1),
            line, pos - offset,
            line, next - offset
          ];

          buffer.push(currentToken);

          pos = next;
        }

        break;
    }

    pos++;
    return currentToken;
  }

  function back(token) {
    returned.push(token);
  }

  return {
    back:back, nextToken:nextToken, endOfFile:endOfFile
  };
}