Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: lib/typoRules.js

Issue 8559070: Integrated URL Fixer into Adblock Plus (Closed)
Patch Set: Created Oct. 12, 2012, 2:18 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « lib/typoNetError.js ('k') | lib/typoSurvey.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/typoRules.js
===================================================================
new file mode 100644
--- /dev/null
+++ b/lib/typoRules.js
@@ -0,0 +1,409 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/FileUtils.jsm");
+
+let {Prefs} = require("prefs");
+
+let RULES_VERSION = 2;
+
+let CUSTOM_RULE_PRIORITY = 0x7FFFFFFF;
+
+let rules = {expressions: []};
+
+loadRules();
+
+// Make first attempt to update rules after five minutes
+let updateTimer = null;
+updateTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);
+updateTimer.initWithCallback(onTimer, 1000 * 60 * 5, Ci.nsITimer.TYPE_REPEATING_SLACK);
+onShutdown.add(function() updateTimer.cancel());
+
+function loadRules()
+{
+ loadRulesFrom(Services.io.newFileURI(getRuleFile()).spec, false, function(success)
+ {
+ if (!success)
+ loadRulesFrom(require("info").addonRoot + "defaults/typoRules.json", true);
+ });
+}
+
+function loadRulesFrom(url, ignoreVersion, callback)
+{
+ if (typeof callback != "function")
+ callback = function() {};
+
+ let request = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(Ci.nsIXMLHttpRequest);
+ request.open("GET", url);
+ request.overrideMimeType("text/plain");
+ request.addEventListener("load", function()
+ {
+ try
+ {
+ // Remove comments from the file if any
+ let data = JSON.parse(request.responseText.replace(/^\s*\/\/.*/mg, ""));
+ if (ignoreVersion || data.version == RULES_VERSION)
+ {
+ rules = data;
+ callback(true);
+
+ // Add user-defined rules after calling the callback - if the callback
+ // saves the rules then the custom rules won't be included.
+ addCustomRules();
+ }
+ else
+ callback(false);
+ }
+ catch (e)
+ {
+ Cu.reportError(e);
+ callback(false);
+ }
+ }, false);
+ request.addEventListener("error", function()
+ {
+ callback(false);
+ }, false);
+
+ try
+ {
+ request.send(null);
+ }
+ catch (e)
+ {
+ if (e.result != Cr.NS_ERROR_FILE_NOT_FOUND)
+ Cu.reportError(e);
+ callback(false);
+ }
+}
+
+function getRuleFile()
+{
+ let result = FileUtils.getFile("ProfD", ["url-fixer-rules.json"]);
+
+ getRuleFile = function() result;
+ return getRuleFile();
+}
+
+function addCustomRules()
+{
+ for (let domain in Prefs.whitelist)
+ onWhitelistEntryAdded(domain);
+}
+
+function onWhitelistEntryAdded(domain)
+{
+ let reverse = domain.split("").reverse().join("");
+ addSuffix(rules.domain, reverse, CUSTOM_RULE_PRIORITY);
+}
+exports.onWhitelistEntryAdded = onWhitelistEntryAdded;
+
+function onWhitelistEntryRemoved(domain)
+{
+ let reverse = domain.split("").reverse().join("");
+ removeSuffix(rules.domain, reverse, CUSTOM_RULE_PRIORITY);
+}
+exports.onWhitelistEntryRemoved = onWhitelistEntryRemoved;
+
+function addSuffix(tree, suffix, priority)
+{
+ if (suffix.length == 0)
+ {
+ // We are at the last character, just put our priority here
+ tree[""] = " " + priority;
+ return;
+ }
+
+ let c = suffix[0];
+ if (c in tree)
+ {
+ let existing = tree[c];
+ if (typeof existing == "string")
+ {
+ // Single choice for this suffix, maybe the same entry?
+ if (existing.substr(0, suffix.length - 1) == suffix.substr(1) && existing[suffix.length - 1] == " ")
+ {
+ // Same entry, simply replace it by new priority
+ tree[c] = suffix.substr(1) + " " + priority;
+ }
+ else
+ {
+ // Different entry, need to add a new branching point and go deeper
+ if (existing[0] == " ")
+ tree[c] = {"": existing};
+ else
+ {
+ tree[c] = {};
+ tree[c][existing[0]] = existing.substr(1);
+ }
+ addSuffix(tree[c], suffix.substr(1), priority);
+ }
+ }
+ else
+ {
+ // Multiple choices for this suffix - go deeper
+ addSuffix(existing, suffix.substr(1), priority);
+ }
+ }
+ else
+ {
+ // No existing entry yet, just add ours
+ tree[c] = suffix.substr(1) + " " + priority;
+ }
+}
+
+function removeSuffix(tree, suffix, priority)
+{
+ if (suffix.length == 0)
+ {
+ // We are at the last character, check whether there is an entry with
+ // matching priority
+ if ("" in tree && tree[""] == " " + priority)
+ delete tree[""];
+ return;
+ }
+
+ let c = suffix[0];
+ if (!(c in tree))
+ return;
+
+ if (typeof tree[c] == "string")
+ {
+ // Single entry - check whether it is the right one
+ if (tree[c] == suffix.substr(1) + " " + priority)
+ delete tree[c];
+ }
+ else
+ {
+ // Multiple entries, need to go deeper
+ removeSuffix(tree[c], suffix.substr(1), priority);
+ }
+}
+
+function onTimer()
+{
+ // Next check in 1 hour
+ updateTimer.delay = 1000 * 60 * 60;
+
+ // Only download rules every three days
+ let nextUpdate = Prefs.lastRuleUpdate + 60 * 60 * 24 * 3;
+ if (nextUpdate > Date.now() / 1000)
+ return;
+
+ loadRulesFrom("http://urlfixer.org/download/rules.json?version=" + RULES_VERSION, false, function(success)
+ {
+ if (success)
+ {
+ rules.timestamp = Date.now();
+
+ try
+ {
+ // Save the rules to file.
+ let rulesText = JSON.stringify(rules);
+ let fileStream = FileUtils.openSafeFileOutputStream(getRuleFile());
+ let stream = Cc["@mozilla.org/intl/converter-output-stream;1"].createInstance(Ci.nsIConverterOutputStream);
+ stream.init(fileStream, "UTF-8", 16384, Ci.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+ stream.writeString(rulesText);
+ stream.flush();
+ FileUtils.closeSafeFileOutputStream(fileStream);
+ }
+ catch(e)
+ {
+ Cu.reportError(e);
+ }
+ }
+ });
+
+ Prefs.lastRuleUpdate = Date.now() / 1000;
+}
+
+exports.getSchemeCorrection = getSchemeCorrection;
+function getSchemeCorrection(scheme)
+{
+ return getBestMatch(scheme, rules.scheme, 1, null);
+}
+
+exports.isKnownScheme = isKnownScheme;
+function isKnownScheme(scheme)
+{
+ return (getSimilarStrings(scheme, rules.scheme, 0, null).length > 0);
+}
+
+exports.getDomainCorrection = getDomainCorrection;
+function getDomainCorrection(domain)
+{
+ // Apply user's custom changes first
+ let customRules = Prefs.custom_replace;
+ for (let searchString in customRules)
+ {
+ let replacement = customRules[searchString];
+ searchString = searchString.toLowerCase();
+ if (/^re:+/.test(searchString))
+ domain = domain.replace(new RegExp(RegExp.rightContext, "g"), replacement);
+ else
+ domain = domain.replace(searchString, replacement);
+ }
+
+ // Now apply our rules on the domain name
+ for (let i = 0, l = rules.expressions.length; i < l; i++)
+ domain = applyExpression(domain, rules.expressions[i]);
+
+ // Find similar known domains, test domains without the www. prefix
+ if (domain.substr(0, 4) == "www.")
+ domain = "www." + getBestMatch(domain.substr(4), rules.domain, 1, ".");
+ else
+ domain = getBestMatch(domain, rules.domain, 1, ".");
+
+ return domain;
+}
+
+exports.getDomainReferral = getDomainReferral;
+function getDomainReferral(domain)
+{
+ if ("domainReferrals" in rules && domain in rules.domainReferrals)
+ return rules.domainReferrals[domain];
+ else
+ return null;
+}
+
+function applyExpression(string, expression)
+{
+ if (expression.nomatch && new RegExp(expression.nomatch).test(string))
+ return string;
+
+ return string.replace(new RegExp(expression.find, "g"), expression.replace);
+}
+
+function getSimilarStrings(input, dictionary, maxDistance, separator)
+{
+ // We use a non-deterministic final automaton to perform a search on all
+ // strings in the dictionary simultaneously (see
+ // http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
+ // for the basic algorithm). However, we use Damerau-Levenshtein distance
+ // (transposition of two adjacent letters counts as one operation), meaning
+ // one additional transition for the automaton. The number of automaton states
+ // can theoretically be extremely large, the maxDistance parameter limits the
+ // number of states we need to process however. We process the input string
+ // backwards to allow matching domain names for a longer host name.
+
+ let results = [];
+
+ function processState(entry, distance, position, result)
+ {
+ let isString = (typeof entry == "string");
+
+ // Do we have a result?
+ if (position < 0 || input[position] == separator)
+ {
+ if (!isString && "" in entry)
+ results.push([result, input.substr(position + 1), distance, parseInt(entry[""], 10)]);
+ else if (isString && entry[0] == " ")
+ results.push([result, input.substr(position + 1), distance, parseInt(entry, 10)]);
+ }
+
+ // Maybe there is a match
+ if (position >= 0)
+ {
+ let nextChar = input[position];
+ if (!isString && nextChar in entry)
+ processState(entry[nextChar], distance, position - 1, nextChar + result);
+ else if (isString && entry[0] == nextChar)
+ processState(entry.substr(1), distance, position - 1, nextChar + result);
+ }
+
+ // Mistakes
+ if (distance < maxDistance)
+ {
+ // Deletion and substitution
+ if (!isString)
+ {
+ for (let c in entry)
+ {
+ if (c != "")
+ processState(entry[c], distance + 1, position, c + result);
+ if (c != "" && position >= 0)
+ processState(entry[c], distance + 1, position - 1, c + result);
+ }
+ }
+ else if (entry[0] != " ")
+ {
+ processState(entry.substr(1), distance + 1, position, entry[0] + result);
+ if (position >= 0)
+ processState(entry.substr(1), distance + 1, position - 1, entry[0] + result);
+ }
+
+ // Insertion
+ if (position >= 0)
+ processState(entry, distance + 1, position - 1, result);
+
+ // Transposition
+ if (position >= 1)
+ {
+ let nextChar1 = input[position];
+ let nextChar2 = input[position - 1];
+ if (isString)
+ {
+ if (entry[0] == nextChar2 && entry[1] == nextChar1)
+ processState(entry.substr(2), distance + 1, position - 2, nextChar1 + nextChar2 + result);
+ }
+ else if (nextChar2 in entry)
+ {
+ let nextEntry = entry[nextChar2];
+ if (typeof nextEntry != "string")
+ {
+ if (nextChar1 in nextEntry)
+ processState(nextEntry[nextChar1], distance + 1, position - 2, nextChar1 + nextChar2 + result);
+ }
+ else
+ {
+ if (nextEntry[0] == nextChar1)
+ processState(nextEntry.substr(1), distance + 1, position - 2, nextChar1 + nextChar2 + result);
+ }
+ }
+ }
+ }
+ }
+
+ processState(dictionary, 0, input.length - 1, "");
+ return results;
+}
+
+function getBestMatch(input, dictionary, maxDistance, separator)
+{
+ let suggestions = getSimilarStrings(input, dictionary, maxDistance, separator);
+
+ let bestSuggestion = null;
+ let bestSuggestionDistance;
+ let bestSuggestionMatched;
+ let bestSuggestionPriority;
+ for (let i = 0; i < suggestions.length; i++)
+ {
+ let [suggestion, matchedString, distance, priority] = suggestions[i];
+ if (suggestion == input)
+ return suggestion;
+
+ let matchedLen = matchedString.length;
+ if (priority < 0 && matchedLen == input.length)
+ {
+ // TLDs should never be proposed as a replacement for the entire host name
+ continue;
+ }
+
+ if (!bestSuggestion ||
+ bestSuggestionMatched < matchedLen ||
+ (bestSuggestionMatched == matchedLen && bestSuggestionDistance > distance) ||
+ (bestSuggestionMatched == matchedLen && bestSuggestionDistance == distance && bestSuggestionPriority < priority))
+ {
+ bestSuggestion = suggestion;
+ bestSuggestionDistance = distance;
+ bestSuggestionMatched = matchedLen;
+ bestSuggestionPriority = priority;
+ }
+ }
+ if (bestSuggestion)
+ return input.substr(0, input.length - bestSuggestionMatched) + bestSuggestion;
+ else
+ return input;
+}
« no previous file with comments | « lib/typoNetError.js ('k') | lib/typoSurvey.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld