diff options
| -rw-r--r-- | background_scripts/completion.coffee | 78 | ||||
| -rw-r--r-- | tests/unit_tests/completion_test.coffee | 44 |
2 files changed, 112 insertions, 10 deletions
diff --git a/background_scripts/completion.coffee b/background_scripts/completion.coffee index beb7003a..0b2e8b0b 100644 --- a/background_scripts/completion.coffee +++ b/background_scripts/completion.coffee @@ -296,24 +296,79 @@ RankingUtils = return false unless matchedTerm true + # Weights used for scoring matches. + matchWeights: + matchAnywhere: 1 + matchStartOfWord: 1 + matchWholeWord: 1 + # The following must be the sum of the three weights above; it is used for normalization. + maximumScore: 3 + # + # Calibration factor for balancing word relevancy and recency. + recencyCalibrator: 2.0/3.0 + # The current value of 2.0/3.0 has the effect of: + # - favoring the contribution of recency when matches are not on word boundaries ( because 2.0/3.0 > (1)/3 ) + # - favoring the contribution of word relevance when matches are on whole words ( because 2.0/3.0 < (1+1+1)/3 ) + + # Calculate a score for matching term against string. + # The score is in the range [0, matchWeights.maximumScore], see above. + # Returns: [ score, count ], where count is the number of matched characters in string. + scoreTerm: (term, string) -> + score = 0 + count = 0 + nonMatching = string.split(RegexpCache.get term) + if nonMatching.length > 1 + # Have match. + score = RankingUtils.matchWeights.matchAnywhere + count = nonMatching.reduce(((p,c) -> p - c.length), string.length) + if RegexpCache.get(term, "\\b").test string + # Have match at start of word. + score += RankingUtils.matchWeights.matchStartOfWord + if RegexpCache.get(term, "\\b", "\\b").test string + # Have match of whole word. + score += RankingUtils.matchWeights.matchWholeWord + [ score, if count < string.length then count else string.length ] + # Returns a number between [0, 1] indicating how often the query terms appear in the url and title. wordRelevancy: (queryTerms, url, title) -> - queryLength = 0 - urlScore = 0.0 - titleScore = 0.0 + urlScore = titleScore = 0.0 + urlCount = titleCount = 0 + # Calculate initial scores. for term in queryTerms - queryLength += term.length - urlScore += 1 if url && RankingUtils.matches [term], url - titleScore += 1 if title && RankingUtils.matches [term], title - urlScore = urlScore / queryTerms.length - urlScore = urlScore * RankingUtils.normalizeDifference(queryLength, url.length) + [ s, c ] = RankingUtils.scoreTerm term, url + urlScore += s + urlCount += c + if title + [ s, c ] = RankingUtils.scoreTerm term, title + titleScore += s + titleCount += c + + maximumPossibleScore = RankingUtils.matchWeights.maximumScore * queryTerms.length + + # Normalize scores. + urlScore /= maximumPossibleScore + urlScore *= RankingUtils.normalizeDifference urlCount, url.length + if title - titleScore = titleScore / queryTerms.length - titleScore = titleScore * RankingUtils.normalizeDifference(queryLength, title.length) + titleScore /= maximumPossibleScore + titleScore *= RankingUtils.normalizeDifference titleCount, title.length else titleScore = urlScore + + # Prefer matches in the title over matches in the URL. + # In other words, don't let a poor urlScore pull down the titleScore. + # For example, urlScore can be unreasonably poor if the URL is very long. + urlScore = titleScore if urlScore < titleScore + + # Return the average. (urlScore + titleScore) / 2 + # Untested alternative to the above: + # - Don't let a poor urlScore pull down a good titleScore, and don't let a poor titleScore pull down a + # good urlScore. + # + # return Math.max(urlScore, titleScore) + # Returns a score between [0, 1] which indicates how recent the given timestamp is. Items which are over # a month old are counted as 0. This range is quadratic, so an item from one day ago has a much stronger # score than an item from two days ago. @@ -326,6 +381,9 @@ RankingUtils = # incresingly discount older history entries. recencyScore = recencyDifference * recencyDifference * recencyDifference + # Calibrate recencyScore vis-a-vis word-relevancy scores. + recencyScore *= RankingUtils.matchWeights.recencyCalibrator + # Takes the difference of two numbers and returns a number between [0, 1] (the percentage difference). normalizeDifference: (a, b) -> max = Math.max(a, b) diff --git a/tests/unit_tests/completion_test.coffee b/tests/unit_tests/completion_test.coffee index fb267f63..bba0a0f8 100644 --- a/tests/unit_tests/completion_test.coffee +++ b/tests/unit_tests/completion_test.coffee @@ -228,6 +228,50 @@ context "suggestions", suggestion = new Suggestion(["queryterm"], "tab", "http://ninjawords.com", "ninjawords", returns(1)) assert.equal -1, suggestion.generateHtml().indexOf("http://ninjawords.com") +context "RankingUtils.wordRelevancy", + should "score higher in shorter URLs", -> + highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/short", "a-title") + lowScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/longer", "a-title") + assert.isTrue highScore > lowScore + + should "score higher in shorter titles", -> + highScore = RankingUtils.wordRelevancy(["coffee"], "a-url", "Coffeescript") + lowScore = RankingUtils.wordRelevancy(["coffee"], "a-url", "Coffeescript rocks") + assert.isTrue highScore > lowScore + + should "score higher for matching the start of a word (in a URL)", -> + lowScore = RankingUtils.wordRelevancy(["stack"], "http://Xstackoverflow.com/same", "a-title") + highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflowX.com/same", "a-title") + assert.isTrue highScore > lowScore + + should "score higher for matching the start of a word (in a title)", -> + lowScore = RankingUtils.wordRelevancy(["te"], "a-url", "Dist racted") + highScore = RankingUtils.wordRelevancy(["te"], "a-url", "Distrac ted") + assert.isTrue highScore > lowScore + + should "score higher for matching a whole word (in a URL)", -> + lowScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflow.comX/same", "a-title") + highScore = RankingUtils.wordRelevancy(["com"], "http://stackoverflowX.com/same", "a-title") + assert.isTrue highScore > lowScore + + should "score higher for matching a whole word (in a title)", -> + lowScore = RankingUtils.wordRelevancy(["com"], "a-url", "abc comX") + highScore = RankingUtils.wordRelevancy(["com"], "a-url", "abcX com") + assert.isTrue highScore > lowScore + + # # TODO: (smblott) + # # Word relevancy should take into account the number of matches (it doesn't currently). + # should "score higher for multiple matches (in a URL)", -> + # lowScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/Xxxxxx", "a-title") + # highScore = RankingUtils.wordRelevancy(["stack"], "http://stackoverflow.com/Xstack", "a-title") + # assert.isTrue highScore > lowScore + + # should "score higher for multiple matches (in a title)", -> + # lowScore = RankingUtils.wordRelevancy(["bbc"], "http://stackoverflow.com/same", "BBC Radio 4 (XBCr4)") + # highScore = RankingUtils.wordRelevancy(["bbc"], "http://stackoverflow.com/same", "BBC Radio 4 (BBCr4)") + # assert.isTrue highScore > lowScore + +context "Suggestion.pushMatchingRanges", should "extract ranges matching term (simple case, two matches)", -> ranges = [] [ one, two, three ] = [ "one", "two", "three" ] |
