author     Stephen Blott    2014-05-16 13:24:38 +0100
committer  Stephen Blott    2014-05-16 13:24:38 +0100
commit     56325ab3858839a9eea61752665192c624c6e852 (patch)
tree       fe87355ab52c0547d55ca821313afcb9ea32534c
parent     a7ac21b392708445374ce9767be2b65eaf89d103 (diff)
download   vimium-56325ab3858839a9eea61752665192c624c6e852.tar.bz2
Import relevancy improvement code.
-rw-r--r--  background_scripts/completion.coffee      127
-rw-r--r--  tests/unit_tests/completion_test.coffee    86
2 files changed, 213 insertions, 0 deletions
diff --git a/background_scripts/completion.coffee b/background_scripts/completion.coffee
index beb7003a..2805dda9 100644
--- a/background_scripts/completion.coffee
+++ b/background_scripts/completion.coffee
@@ -296,8 +296,130 @@ RankingUtils =
return false unless matchedTerm
true
+ # TODO: Remove this long explanatory comment.
+ #
+ # The differences between the following version of `wordRelevancy` and the old one are:
+ #
+ # 1. Word relevancy scores:
+ # - Reduced by a factor of 3 (i.e. multiplied by 1/3) for matches which are not at the start of a word.
+ # - Reduced by a factor of 3/2 (i.e. multiplied by 2/3) for matches which are at the start of a word but are not whole words.
+ # - Unchanged for matches which are whole words.
+ # (These values come from the fudge factors in `matchWeights`, below)
+ #
+ # 2. Recency scores:
+ # - Reduced by a factor of 3/2.
+ # - This retains the existing balance between relevancy and recency for partial matches at the start of a word.
+ # - In the other two cases:
+ # The relative contribution of relevancy is reduced for partial matches.
+ # The relative contribution of relevancy is increased for whole-word matches.
+ #
+ # 3. Normalisation takes account of the number of characters matched by each term (previously, normalisation
+ # accounted only for the total length of the query terms).
+ #
+ # 4. `wordRelevancy()` now does not allow a poor `urlScore` to pull down a good `titleScore`
+ # (But see also the comments regarding the three possible endings, below.)
+
+ # Weights used for scoring matches.
+ matchWeights:
+ {
+ matchAnywhere: 1
+ matchStartOfWord: 1
+ matchWholeWord: 1
+ # The following must be the sum of the weights above; it is used for normalization.
+ maximumScore: 3
+ #
+ # Calibration factor for balancing word relevancy and recency.
+ recencyCalibrator: 2.0/3.0
+ # The current value of 2.0/3.0 has the effect of:
+ # - favoring the contribution of recency when matches are not on word boundaries (because 2.0/3.0 > (1)/3)
+ # - retaining the existing balance when matches are at the starts of words (because 2.0/3.0 = (1+1)/3)
+ # - favoring the contribution of word relevancy when matches are on whole words (because 2.0/3.0 < (1+1+1)/3)
+ }
+
+ # Calculate a score for matching `term` against `string`.
+ # The score is in the range [0, `matchWeights.maximumScore`], see above.
+ # Returns: [ score, count ], where count is the number of matched characters in `string`
+ scoreTerm: (term, string) ->
+ score = 0
+ count = 0
+ nonMatching = string.split(RegexpCache.get term)
+ if nonMatching.length > 1
+ # Have match.
+ score = RankingUtils.matchWeights.matchAnywhere
+ count = nonMatching.reduce(((p,c) -> p - c.length), string.length)
+ if RegexpCache.get(term, "\\b").test string
+ # Have match at start of word.
+ score += RankingUtils.matchWeights.matchStartOfWord
+ if RegexpCache.get(term, "\\b", "\\b").test string
+ # Have match of whole word.
+ score += RankingUtils.matchWeights.matchWholeWord
+ [ score, if count < string.length then count else string.length ]
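+
+ # Illustrative examples (explanatory comment only; these inputs are made up, not from the code):
+ #   RankingUtils.scoreTerm "com", "http://example.com"   yields [ 3, 3 ]
+ #     ("com" matches somewhere, at the start of a word, and as a whole word; it covers 3 characters of the string)
+ #   RankingUtils.scoreTerm "xam", "http://example.com"   yields [ 1, 3 ]
+ #     ("xam" matches only in the middle of a word, so it scores just matchWeights.matchAnywhere)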
+
# Returns a number between [0, 1] indicating how often the query terms appear in the url and title.
wordRelevancy: (queryTerms, url, title) ->
+ urlScore = titleScore = 0.0
+ urlCount = titleCount = 0
+ # Calculate initial scores.
+ for term in queryTerms
+ [ s, c ] = RankingUtils.scoreTerm term, url
+ urlScore += s
+ urlCount += c
+ if title
+ [ s, c ] = RankingUtils.scoreTerm term, title
+ titleScore += s
+ titleCount += c
+
+ maximumPossibleScore = RankingUtils.matchWeights.maximumScore * queryTerms.length
+
+ # Normalize scores.
+ urlScore /= maximumPossibleScore
+ urlScore *= RankingUtils.normalizeDifference urlCount, url.length
+
+ if title
+ titleScore /= maximumPossibleScore
+ titleScore *= RankingUtils.normalizeDifference titleCount, title.length
+ else
+ titleScore = urlScore
+
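+ # Illustrative example of the normalization above (explanatory only; the figures are made up):
+ # with two query terms, maximumPossibleScore = 3 * 2 = 6.  If both terms match whole words in the
+ # URL (urlScore = 6) and together cover 10 of the URL's 40 characters, then the normalized
+ # urlScore is (6 / 6) * RankingUtils.normalizeDifference(10, 40).
+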
+ # ######################################################
+ # Up to this point, some weightings have changed, but the structure of the
+ # calculation is much the same as it was before.
+ #
+ # However, there are three possible endings to this story ...
+
+ # ######################################################
+ # Ending #1: The old ending ...
+ #
+ # return (urlScore + titleScore) / 2
+ #
+ # It's difficult to argue with that.
+
+ # ######################################################
+ # Ending #2: An ending favoring `titleScore` ...
+ #
+ # Prefer matches in the title over matches in the URL.
+ # Here, this means "don't let a poor urlScore pull down the titleScore".
+ # For example, urlScore can be unreasonably poor if the URL is very long.
+ urlScore = titleScore if urlScore < titleScore
+
+ # Return the average.
+ (urlScore + titleScore) / 2
+
+ # ######################################################
+ # Ending #3: An alternative (better?) ending ...
+ #
+ # return Math.max(urlScore, titleScore)
+ #
+ # Why?
+ # - Don't let a poor urlScore pull down a good titleScore, as in Ending #2.
+ # - But also don't let a poor titleScore pull down a good urlScore.
+ # - The query may be targeting one or the other, so let the best one win.
+
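+ # Illustrative comparison of the three endings (explanatory only; the scores are made up):
+ #   urlScore = 0.1, titleScore = 0.5  =>  Ending #1 gives 0.3, Ending #2 gives 0.5, Ending #3 gives 0.5
+ #   urlScore = 0.5, titleScore = 0.1  =>  Ending #1 gives 0.3, Ending #2 gives 0.3, Ending #3 gives 0.5
+ # Only Ending #3 also protects a good urlScore from a poor titleScore.
+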
+ # Pick one of these three endings.
+ # ######################################################
+
+ # Returns a number between [0, 1] indicating how often the query terms appear in the url and title.
+ oldWordRelevancy: (queryTerms, url, title) ->
queryLength = 0
urlScore = 0.0
titleScore = 0.0
@@ -326,6 +448,11 @@ RankingUtils =
    # increasingly discount older history entries.
recencyScore = recencyDifference * recencyDifference * recencyDifference
+ # Calibrate recencyScore vis-a-vis word-relevancy scores.
+ # This does not change the relative order of recency scores.
+ # See also comment in the definition of `matchWeights`, above.
+ recencyScore *= RankingUtils.matchWeights.recencyCalibrator
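+ # (For example, a raw recencyScore of 0.9 becomes 0.9 * (2.0/3.0) = 0.6; the figure is illustrative only.)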
+
# Takes the difference of two numbers and returns a number between [0, 1] (the percentage difference).
normalizeDifference: (a, b) ->
max = Math.max(a, b)
diff --git a/tests/unit_tests/completion_test.coffee b/tests/unit_tests/completion_test.coffee
index fb267f63..e52dbbc1 100644
--- a/tests/unit_tests/completion_test.coffee
+++ b/tests/unit_tests/completion_test.coffee
@@ -228,6 +228,92 @@ context "suggestions",
suggestion = new Suggestion(["queryterm"], "tab", "http://ninjawords.com", "ninjawords", returns(1))
assert.equal -1, suggestion.generateHtml().indexOf("http://ninjawords.com")
+context "RankingUtils.wordRelevancy",
+ should "get a higher relevancy score in shorter URLs", ->
+ highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/short", "nothing")
+ lowScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/longer", "nothing")
+ assert.isTrue highScore > lowScore
+
+ should "get a higher relevancy score in shorter titles", ->
+ highScore = RankingUtils.wordRelevancy(["ffee"], "http://stackoverflow.com/same", "Coffeescript")
+ lowScore = RankingUtils.wordRelevancy(["ffee"], "http://stackoverflow.com/same", "Coffeescript rocks")
+ assert.isTrue highScore > lowScore
+
+ should "get a higher relevancy score for matching the start of a word (in a URL)", ->
+ lowScore = RankingUtils.wordRelevancy(["stack"], "http://Xstackoverflow.com/same", "nothing")
+ highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflowX.com/same", "nothing")
+ assert.isTrue highScore > lowScore
+
+ should "get a higher relevancy score for matching the start of a word (in a title)", ->
+ lowScore = RankingUtils.wordRelevancy(["ted"], "http://stackoverflow.com/same", "Dist racted")
+ highScore = RankingUtils.wordRelevancy(["ted"], "http://stackoverflow.com/same", "Distrac ted")
+ assert.isTrue highScore > lowScore
+
+ should "get a higher relevancy score for matching a whole word (in a URL)", ->
+ lowScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflow.comX/same", "nothing")
+ highScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflowX.com/same", "nothing")
+ assert.isTrue highScore > lowScore
+
+ should "get a higher relevancy score for matching a whole word (in a title)", ->
+ lowScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflow.com/same", "abc comX")
+ highScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflow.com/same", "abcX com")
+ assert.isTrue highScore > lowScore
+
+ # # TODO: (smblott)
+ # # Word relevancy should take into account the number of matches (it doesn't currently).
+ # should "get a higher relevancy score for multiple matches (in a URL)", ->
+ # lowScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/Xxxxxx", "nothing")
+ # highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/Xstack", "nothing")
+ # assert.isTrue highScore > lowScore
+
+ # should "get a higher relevancy score for multiple matches (in a title)", ->
+ # lowScore = RankingUtils.wordRelevancy(["bbc"], "http://stackoverflow.com/same", "BBC Radio 4 (XBCr4)")
+ # highScore = RankingUtils.wordRelevancy(["bbc"], "http://stackoverflow.com/same", "BBC Radio 4 (BBCr4)")
+ # assert.isTrue highScore > lowScore
+
+# WARNING: The following tests are hardware-dependent.  They depend upon
+# different but algebraically equivalent sequences of floating-point
+# operations yielding exactly the same results.  That they ever work at all is
+# quite remarkable.
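+# (As a reminder of how fragile that is: in IEEE-754 double precision, 0.1 + 0.2 is
+# 0.30000000000000004 rather than 0.3, so two different orderings of the same arithmetic
+# need not produce identical floats.)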
+# TODO: (smblott)
+# Remove these tests when `oldWordRelevancy` is removed.
+context "RankingUtils.wordRelevancy (temporary hardware-dependent tests)",
+ should "exactly equal oldWordRelevancy for whole word matches (in a URL)", ->
+ newScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflow.com/same", "irrelevant")
+ oldScore = RankingUtils.oldWordRelevancy(["com"], "http://stackoverflow.com/same", "irrelevant")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+
+ should "yield 2/3 * oldWordRelevancy for matches at the start of a word (in a URL)", ->
+ newScore = RankingUtils.wordRelevancy(["sta"], "http://stackoverflow.com/same", "irrelevant")
+ oldScore = (2.0/3.0) * RankingUtils.oldWordRelevancy(["sta"], "http://stackoverflow.com/same", "irrelevant")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+
+ should "yield 1/3 * oldWordRelevancy for matches within a word (in a URL)", ->
+ newScore = RankingUtils.wordRelevancy(["over"], "http://stackoverflow.com/same", "irrelevant")
+ oldScore = (1.0/3.0) * RankingUtils.oldWordRelevancy(["over"], "http://stackoverflow.com/same", "irrelevant")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+
+ should "exactly equal oldWordRelevancy for whole word matches (in a title)", ->
+ newScore = RankingUtils.wordRelevancy(["relevant"], "http://stackoverflow.com/same", "XX relevant YY")
+ # Multiply by 2 to account for new wordRelevancy favoring title.
+ oldScore = 2 * RankingUtils.oldWordRelevancy(["relevant"], "http://stackoverflow.com/same", "XX relevant YY")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+
+ should "yield 2/3 * oldWordRelevancy for matches at the start of a word (in a title)", ->
+ newScore = RankingUtils.wordRelevancy(["relev"], "http://stackoverflow.com/same", "XX relevant YY")
+ # Multiply by 2 to account for new wordRelevancy favoring title.
+ oldScore = (2.0/3.0) * 2 * RankingUtils.oldWordRelevancy(["relev"], "http://stackoverflow.com/same", "XX relevant YY")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+
+ should "yield 1/3 * oldWordRelevancy for matches within a word (in a title)", ->
+ newScore = RankingUtils.wordRelevancy(["elev"], "http://stackoverflow.com/same", "XX relevant YY")
+ # Multiply by 2 to account for new wordRelevancy favoring title.
+ oldScore = (1.0/3.0) * 2 * RankingUtils.oldWordRelevancy(["elev"], "http://stackoverflow.com/same", "XX relevant YY")
+ assert.equal newScore, oldScore # remarkable! Exactly equal floats.
+#
+# End of hardware-dependent tests.
+
+context "Suggestion.pushMatchingRanges",
should "extract ranges matching term (simple case, two matches)", ->
ranges = []
[ one, two, three ] = [ "one", "two", "three" ]