From 16f978cab6542e7aaa932e1af87b9eec5c214b14 Mon Sep 17 00:00:00 2001 From: Stephen Blott Date: Tue, 12 May 2015 10:11:08 +0100 Subject: Simplify and filter vomnibar URLs. - Remove various bits of URL fluff (the scheme, trailing URL separators). - Remove various unhelpful Google search parameters. - Filter for duplicates (based on the simplified URL). --- background_scripts/completion.coffee | 45 +++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'background_scripts') diff --git a/background_scripts/completion.coffee b/background_scripts/completion.coffee index 85829c75..3251f58a 100644 --- a/background_scripts/completion.coffee +++ b/background_scripts/completion.coffee @@ -41,6 +41,7 @@ class Suggestion # or @relevancyFunction. @relevancy ?= @relevancyFunction this + # Note. This always returns a truthy value. generateHtml: -> return @html if @html relevancyHtml = if @showRelevancy then "#{@computeRelevancy()}" else "" @@ -52,7 +53,7 @@ class Suggestion #{@highlightQueryTerms Utils.escapeHtml @title}
- #{@shortenUrl @highlightQueryTerms Utils.escapeHtml @url} + #{@highlightQueryTerms Utils.escapeHtml @shortenUrl()} #{relevancyHtml}
""" @@ -68,8 +69,6 @@ class Suggestion a.href = url a.hostname - shortenUrl: (url) -> @stripTrailingSlash(url).replace(/^https?:\/\//, "") - stripTrailingSlash: (url) -> url = url.substring(url, url.length - 1) if url[url.length - 1] == "/" url @@ -129,6 +128,28 @@ class Suggestion previous = range mergedRanges + # Simplify a suggestion's URL (by removing those parts which aren't useful for either display or comparison). + shortenUrl: () -> + return @shortUrl if @shortUrl? + url = @url + for [ filter, replacements ] in @stripPatterns + if new RegExp(filter).test url + for replace in replacements + url = url.replace replace, "" + @shortUrl = url + + # Patterns to strip from URLs; of the form [ [ filter, replacements ], [ filter, replacements ], ... ] + # - filter is a regexp; a URL must match this regexp first. + # - replacements (itself a list) is a list of regexps, each of which is removed from matching URLs. + # + stripPatterns: [ + # Google search specific replacements; replaces query parameters which are known to not be helpful. + [ '^https?://www\.google\.(com|ca|com\.au|co\.uk|ie)/.*[&?]q=', + "ei gws_rd url ved usg sa usg sig2".split(/\s+/).map (param) -> new RegExp "\&#{param}=[^&]+" ] + + # General replacements; replaces leading and trailing fluff. + [ '.', [ "^https?://", "\\W+$" ].map (re) -> new RegExp re ] + ] class BookmarkCompleter folderSeparator: "/" @@ -575,11 +596,23 @@ class MultiCompleter @filter @mostRecentQuery... prepareSuggestions: (queryTerms, suggestions) -> + # Compute suggestion relevancies and sort. suggestion.computeRelevancy queryTerms for suggestion in suggestions suggestions.sort (a, b) -> b.relevancy - a.relevancy - for suggestion in suggestions[0...@maxResults] - suggestion.generateHtml() - suggestion + + # Simplify URLs and remove duplicates (duplicate simplified URLs, that is). + count = 0 + seenUrls = {} + suggestions = + for suggestion in suggestions + url = suggestion.shortenUrl() + continue if seenUrls[url] + break if ++count == @maxResults + seenUrls[url] = suggestion + + # Generate HTML for the remaining suggestions and return them. + suggestion.generateHtml() for suggestion in suggestions + suggestions # Utilities which help us compute a relevancy score for a given item. RankingUtils = -- cgit v1.2.3