improve URL detection

author: Niklas Baumstark 2012-01-16 17:03:21 +0100
committer: Jez Ng 2012-01-17 03:22:48 +0800
commit: 4e3ea20ff2b78b25a6e00e4d538939b07a40b644 (patch)
tree: 83f1b789ec53c42006e921c590c18a47652282a6 /lib/utils.js
parent: 73f8a90bff353e6b3c1e61c0d339d95f2c349cd4 (diff)
download: vimium-4e3ea20ff2b78b25a6e00e4d538939b07a40b644.tar.bz2
1 files changed, 66 insertions, 8 deletions
diff --git a/lib/utils.js b/lib/utils.js
index a0668409..12304be2 100644
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -76,22 +76,80 @@ var utils = {
   },
 
   /**
+   * Creates a Google search URL from the given :query. The query is appended as-is, without escaping.
+   */
+  createSearchUrl: function(query) {
+    return "http://www.google.com/search?q=" + query;
+  },
+
+  /**
    * Tries to convert :str into a valid URL.
    * We don't bother with escaping characters, however, as Chrome will do that for us.
    */
   ensureUrl: function(str) {
+    // more or less RFC-compliant parsing of the "user:password@host:port" part of a URL. This should be
+    // sufficient for our needs
+    var urlRegex = new RegExp(
+      '^(?:([^:]+)(?::([^:]+))?@)?' +   // user:password (optional)     => \1, \2
+      '([^:]+|\\[[^\\]]+\\])'       +   // host name (IPv6 addresses in square brackets allowed) => \3
+      '(?::(\\d+))?$'                   // port number (optional)       => \4
+      );
+
+    // TLDs longer than 3 characters that are accepted as the last host name part (including the unofficial
+    // .onion TLD used by Tor). NOTE(review): the list may be incomplete (e.g. .aero, .travel are not listed)
+    var longTlds = [
+      'arpa',
+      'asia',
+      'coop',
+      'info',
+      'jobs',
+      'local',
+      'mobi',
+      'museum',
+      'name',
+      'onion',
+    ];
+
+    // host names that are accepted as URLs without further checks; are there more?
+    var specialHostNames = [ 'localhost' ];
+
     // trim str
     str = str.replace(/^\s+|\s+$/g, '');
 
-    // definitely not a valid URL; treat as a search query
-    if (str.indexOf(" ") != -1 || (str.indexOf('.') == -1 && !/^((http|https|ftp):\/\/)?localhost/.test(str)))
-      return "http://www.google.com/search?q=" + str;
-    // possibly a valid URL, but not canonical
-    else if (!/^(http|https|ftp|chrome):\/\//.test(str))
-      return "http://" + str;
-    // cross our fingers and hope it is valid
-    else
+    // it starts with a scheme (3+ lowercase letters followed by "://"), so treat it as a URL and return it as-is
+    if (/^[a-z]{3,}:\/\//.test(str))
       return str;
+    var strWithScheme = 'http://' + str;
+
+    // contains a space, so definitely not a valid URL; treat as search query
+    if (str.indexOf(' ') >= 0)
+      return utils.createSearchUrl(str);
+
+    // assuming that this is a URL, try to parse the part before the first '/' into its meaningful parts. If
+    // matching fails, we're pretty sure that we don't have some kind of URL here.
+    var match = urlRegex.exec(str.split('/')[0]);
+    if (!match)
+      return utils.createSearchUrl(str);
+    var hostname = match[3];
+
+    // allow known special host names
+    if (specialHostNames.indexOf(hostname) >= 0)
+      return strWithScheme;
+
+    // allow IPv6 addresses (need to be wrapped in brackets, as required by RFC).  It is sufficient to check
+    // for a colon here, as the regex wouldn't match colons in the host name unless it's an IPv6 address
+    if (hostname.indexOf(':') >= 0)
+      return strWithScheme;
+
+    // at this point we have to make a decision. As a heuristic, we check if the host name has dots in it. If
+    // so, and the last part could be a TLD (at most 3 characters, or listed in longTlds), treat it as a URL
+    var dottedParts = hostname.split('.');
+    var lastPart = dottedParts[dottedParts.length-1];
+    if (dottedParts.length > 1 && (lastPart.length <= 3 || longTlds.indexOf(lastPart) >= 0))
+      return strWithScheme;
+
+    // fallback: use as search query
+    return utils.createSearchUrl(str);
   },
 
 };
author	Niklas Baumstark	2012-01-16 17:03:21 +0100
committer	Jez Ng	2012-01-17 03:22:48 +0800
commit	4e3ea20ff2b78b25a6e00e4d538939b07a40b644 (patch)
tree	83f1b789ec53c42006e921c590c18a47652282a6 /lib/utils.js
parent	73f8a90bff353e6b3c1e61c0d339d95f2c349cd4 (diff)
download	vimium-4e3ea20ff2b78b25a6e00e4d538939b07a40b644.tar.bz2