diff options
| author | Niklas Baumstark | 2012-01-16 17:03:21 +0100 |
|---|---|---|
| committer | Jez Ng | 2012-01-17 03:22:48 +0800 |
| commit | 4e3ea20ff2b78b25a6e00e4d538939b07a40b644 (patch) | |
| tree | 83f1b789ec53c42006e921c590c18a47652282a6 /lib/utils.js | |
| parent | 73f8a90bff353e6b3c1e61c0d339d95f2c349cd4 (diff) | |
| download | vimium-4e3ea20ff2b78b25a6e00e4d538939b07a40b644.tar.bz2 | |
improve URL detection
Diffstat (limited to 'lib/utils.js')
| -rw-r--r-- | lib/utils.js | 74 |
1 files changed, 66 insertions, 8 deletions
diff --git a/lib/utils.js b/lib/utils.js index a0668409..12304be2 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -76,22 +76,80 @@ var utils = { }, /** + * Creates a search URL from the given :query. + */ + createSearchUrl: function(query) { + return "http://www.google.com/search?q=" + query; + }, + + /** * Tries to convert :str into a valid URL. * We don't bother with escaping characters, however, as Chrome will do that for us. */ ensureUrl: function(str) { + // more or less RFC compliant URL host part parsing. This should be sufficient + // for our needs + var urlRegex = new RegExp( + '^(?:([^:]+)(?::([^:]+))?@)?' + // user:password (optional) => \1, \2 + '([^:]+|\\[[^\\]]+\\])' + // host name (IPv6 addresses in square brackets allowed) => \3 + '(?::(\\d+))?$' // port number (optional) => \4 + ); + + // these are all official ASCII TLDs that are longer than 3 characters + // (including the inofficial .onion TLD used by TOR) + var longTlds = [ + 'arpa', + 'asia', + 'coop', + 'info', + 'jobs', + 'local', + 'mobi', + 'museum', + 'name', + 'onion', + ]; + + // are there more? + var specialHostNames = [ 'localhost' ]; + // trim str str = str.replace(/^\s+|\s+$/g, ''); - // definitely not a valid URL; treat as a search query - if (str.indexOf(" ") != -1 || (str.indexOf('.') == -1 && !/^((http|https|ftp):\/\/)?localhost/.test(str))) - return "http://www.google.com/search?q=" + str; - // possibly a valid URL, but not canonical - else if (!/^(http|https|ftp|chrome):\/\//.test(str)) - return "http://" + str; - // cross our fingers and hope it is valid - else + // it starts with a scheme, so it's definitely an URL + if (/^[a-z]{3,}:\/\//.test(str)) return str; + var strWithScheme = 'http://' + str; + + // definitely not a valid URL; treat as search query + if (str.indexOf(' ') >= 0) + return utils.createSearchUrl(str); + + // assuming that this is an URL, try to parse it into its meaningful parts. If matching fails, we're + // pretty sure that we don't have some kind of URL here. + var match = urlRegex.exec(str.split('/')[0]); + if (!match) + return utils.createSearchUrl(str); + var hostname = match[3]; + + // allow known special host names + if (specialHostNames.indexOf(hostname) >= 0) + return strWithScheme; + + // allow IPv6 addresses (need to be wrapped in brackets, as required by RFC). It is sufficient to check + // for a colon here, as the regex wouldn't match colons in the host name unless it's an v6 address + if (hostname.indexOf(':') >= 0) + return strWithScheme; + + // at this point we have to make a decision. As a heuristic, we check if the input has dots in it. If + // yes, and if the last part could be a TLD, treat it as an URL + var dottedParts = hostname.split('.'); + var lastPart = dottedParts[dottedParts.length-1]; + if (dottedParts.length > 1 && (lastPart.length <= 3 || longTlds.indexOf(lastPart) >= 0)) + return strWithScheme; + + // fallback: use as search query + return utils.createSearchUrl(str); }, }; |
