Index: lib/abp2blocklist.js |
=================================================================== |
--- a/lib/abp2blocklist.js |
+++ b/lib/abp2blocklist.js |
@@ -42,16 +42,54 @@ |
typeMap.WEBRTC | |
typeMap.OBJECT_SUBREQUEST | |
typeMap.PING | |
typeMap.OTHER; |
// Bit mask combining httpRequestTypes with the WEBSOCKET and WEBRTC type bits. |
// NOTE(review): presumably the set of request types an exception (whitelist) |
// filter may apply to; confirm against the code that reads this constant. |
const whitelistableRequestTypes = httpRequestTypes | |
typeMap.WEBSOCKET | |
typeMap.WEBRTC; |
/**
 * Schedule a function to be called asynchronously, once the current call
 * stack has unwound
 *
 * @param {function} func The function to call; it is called without arguments
 *
 * @returns {Promise} A promise that resolves with the return value of func, or
 *                    rejects with whatever func throws
 */
function callLater(func)
{
  return new Promise((resolve, reject) =>
  {
    let call = () =>
    {
      // func runs from a scheduler callback, i.e. outside the promise
      // executor, so an exception has to be routed to reject explicitly.
      // Otherwise the promise would never settle and the error would surface
      // as an uncaught exception.
      try
      {
        resolve(func());
      }
      catch (error)
      {
        reject(error);
      }
    };

    // If this looks like Node.js, call process.nextTick, otherwise call
    // setTimeout.
    if (typeof process != "undefined" && typeof process.nextTick == "function")
      process.nextTick(call);
    else
      setTimeout(call, 0);
  });
}
+ |
/**
 * Run a series of functions strictly one after another
 *
 * Each function is called only after the promise returned by the previous one
 * (if any) has settled successfully.
 *
 * @param {function[]|function} funcs Either an array of functions, or the
 *                                    first of any number of functions passed
 *                                    as individual arguments
 *
 * @returns {Promise} A promise that resolves with the result of the last
 *                    function
 */
function async(funcs)
{
  // Accept both async([f, g]) and async(f, g).
  if (!Array.isArray(funcs))
    funcs = Array.from(arguments);

  let lastPause = Date.now();
  let chain = Promise.resolve();

  funcs.forEach(next =>
  {
    chain = chain.then(() =>
    {
      let now = Date.now();

      // Less than 100ms since the last pause: keep going synchronously.
      if (now - lastPause < 100)
        return next();

      // Otherwise defer the call so the browser gets a chance to breathe
      // instead of freezing up.
      lastPause = now;
      return callLater(next);
    });
  });

  return chain;
}
+ |
function parseDomains(domains, included, excluded) |
{ |
for (let domain in domains) |
{ |
if (domain != "") |
{ |
let enabled = domains[domain]; |
domain = punycode.toASCII(domain.toLowerCase()); |
@@ -609,24 +647,505 @@ |
if (unlessDomain) |
rule.trigger["unless-domain"] = unlessDomain; |
rules.push(rule); |
} |
} |
/**
 * Check if two strings are a close match
 *
 * This function returns an edit operation, one of "substitute", "delete", and
 * "insert", along with an index in the source string where the edit must occur
 * in order to arrive at the target string. If the strings are not a close
 * match, it returns null.
 *
 * Two strings are considered to be a close match if they are one edit
 * operation apart.
 *
 * Deletions or insertions of a contiguous range of characters from one string
 * into the other, at the same index, are treated as a single edit. For
 * example, "internal" and "international" are considered to be one edit apart
 * and therefore a close match. Such multiple character edits additionally
 * carry an endIndex, which is the index in the longer of the two strings at
 * which the extra range of characters ends.
 *
 * A few things to note:
 *
 *   1) This function does not care about the format of the input strings. For
 *   example, the caller may pass in regular expressions, where "[ab]" and
 *   "[bc]" could be considered to be a close match, since the order within the
 *   brackets doesn't matter. This function will still return null for this set
 *   of inputs since they are two edits apart.
 *
 *   2) To be friendly to calling code that might be passing in regular
 *   expressions, this function will simply return null if it encounters a
 *   special character (e.g. "\", "?", "+", "*", etc.) in the delta. For
 *   example, given "Hello" and "Hello, how are you?", it will return null.
 *   For a substitution the delta consists of the differing character of both
 *   strings, so both are checked.
 *
 *   3) If the caller does indeed pass in regular expressions, it must make the
 *   important assumption that the parts where two such regular expressions may
 *   differ can always be treated as normal strings. For example,
 *   "^https?://example.com/ads" and "^https?://example.com/adv" differ only in
 *   the last character, therefore the regular expressions can safely be merged
 *   into "^https?://example.com/ad[sv]".
 *
 * @param {string} s The source string
 * @param {string} t The target string
 *
 * @returns {?object} An object describing the single edit operation that must
 *                    occur in the source string in order to arrive at the
 *                    target string, or null if the strings are not a close
 *                    match
 */
function closeMatch(s, t)
{
  // Characters with a special meaning in regular expressions. A delta that
  // contains any of these cannot be treated as a plain string.
  const specialCharacters = ".+$?{}()[]\\*^|";

  let diff = s.length - t.length;

  // If target is longer than source, swap them for the purpose of our
  // calculation. From here on s is never the shorter string; diff keeps its
  // sign so we still know which direction the edit goes.
  if (diff < 0)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let i = 0;
  let j = 0;

  // Start from the beginning and keep going until we hit a character that
  // doesn't match.
  for (; i < s.length; i++)
  {
    if (s[i] != t[i])
      break;
  }

  // Now do exactly the same from the end, but also stop if we reach the
  // position where we terminated the previous loop.
  for (; j < t.length; j++)
  {
    if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
      break;
  }

  if (diff == 0)
  {
    // If the strings are equal in length and the delta isn't exactly one
    // character, it's not a close match.
    if (t.length - j - i != 1)
      return null;
  }
  else if (i != t.length - j)
  {
    // For strings of unequal length, if we haven't found a match for every
    // single character in the shorter string counting from both the beginning
    // and the end, it's not a close match.
    return null;
  }

  // If the delta contains any special characters, it's not a close match.
  for (let k = i; k < s.length - j; k++)
  {
    if (specialCharacters.includes(s[k]))
      return null;
  }

  // For a substitution the replaced character in the other string is part of
  // the delta as well; e.g. "a\\db" and "a\\.b" must not be merged.
  if (diff == 0 && specialCharacters.includes(t[i]))
    return null;

  let edit = null;

  if (diff == 0)
  {
    edit = {type: "substitute", index: i};
  }
  else if (diff > 0)
  {
    edit = {type: "delete", index: i};

    if (diff > 1)
      edit.endIndex = s.length - j;
  }
  else
  {
    edit = {type: "insert", index: i};

    if (diff < -1)
      edit.endIndex = s.length - j;
  }

  return edit;
}
+ |
/**
 * Mark and drop rules whose URL filter is made redundant by another rule
 *
 * A rule is considered redundant if another rule's URL filter is a prefix of
 * its own URL filter, since URL filters are not anchored at the end. Redundant
 * rules get their redundant flag set on their rulesInfo entry.
 *
 * @param {object[]} rulesInfo Entries of the form {rule}, all of which are
 *                             expected to differ only in their URL filter
 * @param {boolean} exhaustive Whether to compare each rule with every rule
 *                             that follows it rather than only those within
 *                             the heuristic range
 *
 * @returns {Promise} A promise that resolves with the entries of rulesInfo
 *                    that have not been marked redundant
 */
function eliminateRedundantRulesByURLFilter(rulesInfo, exhaustive)
{
  const heuristicRange = 1000;

  // Quantifiers that allow zero occurrences of the character before them. If
  // the longer filter continues with one of these right after the prefix, the
  // last character of the prefix is optional there (e.g. "ads?x" also matches
  // "adx", which "ads" doesn't), so the longer filter is not redundant.
  const looseningQuantifiers = "*?{";

  let subsumes = (shorter, longer) =>
  {
    if (!longer.startsWith(shorter))
      return false;

    return longer.length == shorter.length ||
           !looseningQuantifiers.includes(longer[shorter.length]);
  };

  // Throw out obviously redundant rules.
  return async(rulesInfo.map((ruleInfo, index) => () =>
  {
    // If this rule is already marked as redundant, don't bother comparing it
    // with other rules.
    if (ruleInfo.redundant)
      return;

    let limit = exhaustive ? rulesInfo.length :
                Math.min(index + heuristicRange, rulesInfo.length);

    let source = ruleInfo.rule.trigger["url-filter"];

    for (let j = index + 1; j < limit; j++)
    {
      let other = rulesInfo[j];

      if (other.redundant)
        continue;

      let target = other.rule.trigger["url-filter"];

      if (source.length >= target.length)
      {
        // If one URL filter is a substring of the other starting at the
        // beginning, the other one is clearly redundant.
        if (subsumes(target, source))
        {
          ruleInfo.redundant = true;
          break;
        }
      }
      else if (subsumes(source, target))
      {
        other.redundant = true;
      }
    }
  }))
  .then(() => rulesInfo.filter(info => !info.redundant));
}
+ |
/**
 * Record, for the rule at the given index, every following rule whose URL
 * filter is a close match
 *
 * Matches are stored on the rulesInfo entry that would absorb the other rule:
 * single character edits go into matches (grouped by edit index) and
 * bestMatches (the largest such group seen so far), multiple character edits
 * into multiEditMatch.
 *
 * @param {object[]} rulesInfo Entries of the form {rule}
 * @param {number} index Index of the rule to find matches for
 * @param {boolean} exhaustive Whether to look at all following rules rather
 *                             than only those within the heuristic range
 */
function findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
{
  // Closely matching rules tend to sit near each other, so by default only
  // this many following rules are examined. A larger value can find more
  // matches and yield a smaller rule set, but possibly at a significant
  // performance cost. With the exhaustive option the range is ignored.
  const heuristicRange = 1000;

  let end = rulesInfo.length;
  if (!exhaustive)
    end = Math.min(index + heuristicRange, rulesInfo.length);

  for (let other = index + 1; other < end; other++)
  {
    let sourceFilter = rulesInfo[index].rule.trigger["url-filter"];
    let targetFilter = rulesInfo[other].rule.trigger["url-filter"];

    let edit = closeMatch(sourceFilter, targetFilter);
    if (!edit)
      continue;

    // Only deletions and substitutions can be grouped. An insertion into the
    // source is therefore recorded as a deletion on the target rule instead.
    let isInsertion = edit.type == "insert";
    if (isInsertion)
      edit.type = "delete";

    let owner = rulesInfo[isInsertion ? other : index];
    let ownerFilter = isInsertion ? targetFilter : sourceFilter;
    let match = {edit, index: isInsertion ? index : other};

    // An end index marks a multiple character edit.
    if (edit.endIndex)
    {
      // A rule can only be merged with one other rule this way, regardless of
      // how many characters are involved, so the first such match is kept.
      if (!owner.multiEditMatch)
        owner.multiEditMatch = match;

      continue;
    }

    // Single character edits: several rules can be merged into one, e.g.
    // "ad", "ads", and "adv" into "ad[sv]?".
    if (!owner.matches)
      owner.matches = new Array(ownerFilter.length);

    // Group matches by the index they occur at. For a source string "ads",
    // both "ad" (deletion) and "adv" (substitution) match at index 2 and may
    // later be merged together into "ad[sv]?".
    let group = owner.matches[edit.index];

    if (group)
    {
      group.push(match);
    }
    else
    {
      group = [match];
      owner.matches[edit.index] = group;
    }

    // Remember the largest group; candidates are later sorted by it.
    if (!owner.bestMatches || group.length > owner.bestMatches.length)
      owner.bestMatches = group;
  }
}
+ |
/**
 * Merge rules with closely matching URL filters, based on the matches
 * previously recorded on the rulesInfo entries
 *
 * The URL filter of an absorbing rule is rewritten in place and the entry gets
 * its mergedInto flag set; absorbed entries get their merged flag set.
 *
 * @param {object[]} rulesInfo Entries of the form {rule}, optionally carrying
 *                             matches, bestMatches and multiEditMatch
 */
function mergeCandidateRulesByURLFilter(rulesInfo)
{
  // Size of the largest set of rules an entry could absorb.
  let weigh = info =>
  {
    if (info.bestMatches)
      return info.bestMatches.length;

    return info.multiEditMatch ? 1 : 0;
  };

  // A match is usable only while its rule has neither been merged into
  // another rule nor had other rules merged into it.
  let isAvailable = match =>
  {
    let other = rulesInfo[match.index];
    return !other.merged && !other.mergedInto;
  };

  // Only rules with at least one match are candidates. Largest sets go first:
  // we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to generate
  // "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and "[ab]dx"
  // (3 rules).
  let candidates = rulesInfo.filter(
    info => info.bestMatches || info.multiEditMatch
  );
  candidates.sort((info1, info2) => weigh(info2) - weigh(info1));

  for (let candidate of candidates)
  {
    // Already absorbed by another rule.
    if (candidate.merged)
      continue;

    // The best set of rules to group is simply the largest one still
    // available.
    let best = [];
    (candidate.matches || []).forEach(group =>
    {
      let available = (group || []).filter(isAvailable);
      if (available.length > best.length)
        best = available;
    });

    // Without a single character match, fall back to a multiple character
    // edit, e.g. "ad" and "adserver" into "ad(server)?".
    let multiEdit = false;
    if (best.length == 0 && candidate.multiEditMatch &&
        isAvailable(candidate.multiEditMatch))
    {
      best = [candidate.multiEditMatch];
      multiEdit = true;
    }

    if (best.length == 0)
      continue;

    let filter = candidate.rule.trigger["url-filter"];
    let start = best[0].edit.index;

    if (multiEdit)
    {
      let end = best[0].edit.endIndex;

      // Mark the target rule as merged so other rules leave it alone.
      rulesInfo[best[0].index].merged = true;

      filter = filter.substring(0, start) + "(" +
               filter.substring(start, end) + ")?" +
               filter.substring(end);
    }
    else
    {
      let alternatives = [filter[start]];
      let optional = false;

      for (let match of best)
      {
        let other = rulesInfo[match.index];

        if (match.edit.type == "delete")
        {
          optional = true;
        }
        else
        {
          let character = other.rule.trigger["url-filter"][start];

          // A hyphen goes first so it is read as a literal hyphen.
          if (character == "-")
            alternatives.unshift(character);
          else
            alternatives.push(character);
        }

        // Mark the target rule as merged so other rules leave it alone.
        other.merged = true;
      }

      // First make the character optional if needed, then replace it with a
      // character class if there is more than one alternative.
      let quantified = filter.substring(0, start + 1) +
                       (optional ? "?" : "") +
                       filter.substring(start + 1);

      if (alternatives.length > 1)
      {
        filter = quantified.substring(0, start) + "[" +
                 alternatives.join("") + "]" +
                 quantified.substring(start + 1);
      }
      else
      {
        filter = quantified;
      }
    }

    candidate.rule.trigger["url-filter"] = filter;

    // This rule has now had other rules merged into it.
    candidate.mergedInto = true;
  }
}
+ |
/**
 * Find closely matching URL filters among the given rules and merge them
 *
 * @param {object[]} rulesInfo Entries of the form {rule}
 * @param {boolean} exhaustive Whether to look for matches throughout the rule
 *                             set rather than within the heuristic range only
 *
 * @returns {Promise} A promise that resolves once all matches have been found
 *                    and the candidate rules have been merged
 */
function mergeRulesByURLFilter(rulesInfo, exhaustive)
{
  // One step per rule, so the search can be interrupted between rules.
  let steps = [];

  for (let index = 0; index < rulesInfo.length; index++)
  {
    steps.push(
      () => findMatchesForRuleByURLFilter(rulesInfo, index, exhaustive)
    );
  }

  let searched = async(steps);

  return searched.then(() => mergeCandidateRulesByURLFilter(rulesInfo));
}
+ |
/**
 * Merge a group of rules that differ only in one array property into the
 * first rule of the group
 *
 * The first rule receives the union of the values of all the rules and gets
 * its mergedInto flag set; all other rules get their merged flag set.
 *
 * A rule that doesn't have the property at all is not restricted by it (e.g.
 * a rule without "if-domain" applies on every domain) and therefore already
 * covers every other rule in the group. In that case the property is removed
 * from the first rule rather than set to the union, which would silently
 * narrow the unrestricted rule.
 *
 * @param {object[]} rulesInfo Entries of the form {rule}, identical except
 *                             for the given property
 * @param {string} propertyType Either "trigger" or "action"
 * @param {string} property Name of the array property, e.g. "if-domain"
 */
function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
{
  if (rulesInfo.length <= 1)
    return;

  let valueSet = new Set();
  let unrestricted = false;

  for (let i = 0; i < rulesInfo.length; i++)
  {
    let values = rulesInfo[i].rule[propertyType][property];

    if (values == null)
    {
      unrestricted = true;
    }
    else
    {
      for (let value of values || [])
        valueSet.add(value);
    }

    // Everything but the first rule is absorbed.
    if (i > 0)
      rulesInfo[i].merged = true;
  }

  let mergedInto = rulesInfo[0].rule[propertyType];

  if (unrestricted)
    delete mergedInto[property];
  else if (valueSet.size > 0)
    mergedInto[property] = Array.from(valueSet);

  rulesInfo[0].mergedInto = true;
}
+ |
/**
 * Group rules that are identical except for the value of one property
 *
 * The group key is the JSON serialization of a rule's trigger and action with
 * the given property removed; the rules themselves are not modified.
 *
 * @param {object[]} rulesInfo Entries of the form {rule}
 * @param {string} propertyType Either "trigger" or "action"
 * @param {string} property Name of the property to ignore when comparing
 *
 * @returns {Map.<string,object[]>} The rulesInfo entries of each group, keyed
 *                                  by group key
 */
function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
{
  let groups = new Map();

  for (let info of rulesInfo)
  {
    // Work on shallow copies so the rule keeps its property.
    let stripped = {};
    stripped.trigger = Object.assign({}, info.rule.trigger);
    stripped.action = Object.assign({}, info.rule.action);

    delete stripped[propertyType][property];

    let key = JSON.stringify(stripped);
    let members = groups.get(key);

    if (!members)
      groups.set(key, [info]);
    else
      members.push(info);
  }

  return groups;
}
+ |
/**
 * Reduce a set of rules by merging rules that differ in only one respect
 *
 * Rules are first merged by URL filter (after throwing out redundant ones),
 * then by the "resource-type" and "if-domain" trigger properties, in that
 * order.
 *
 * @param {object[]} rules The rules to merge
 * @param {boolean} exhaustive Whether to look for URL filter matches
 *                             throughout the rule set rather than within the
 *                             heuristic range only
 *
 * @returns {Promise.<object[]>} A promise that resolves with the remaining
 *                               rules
 */
function mergeRules(rules, exhaustive)
{
  let rulesInfo = rules.map(rule => ({rule}));

  // Current groups of rules that differ only in the given trigger property.
  let groupsFor = property => Array.from(
    groupRulesByMergeableProperty(rulesInfo, "trigger", property).values()
  );

  let mergeByURLFilter = () =>
  {
    let steps = groupsFor("url-filter").map(group => () =>
      eliminateRedundantRulesByURLFilter(group, exhaustive)
      .then(remaining => mergeRulesByURLFilter(remaining, exhaustive))
    );

    return async(steps).then(() =>
    {
      // Filter out rules that are redundant or have been merged into other
      // rules.
      rulesInfo = rulesInfo.filter(info => !info.redundant && !info.merged);
    });
  };

  let mergeByArrayProperty = property => () =>
  {
    let steps = groupsFor(property).map(group => () =>
      mergeRulesByArrayProperty(group, "trigger", property)
    );

    return async(steps).then(() =>
    {
      rulesInfo = rulesInfo.filter(info => !info.merged);
    });
  };

  // NOTE(review): an inline review thread at this spot suggested that async
  // always take a sequence as its first argument; the single function form is
  // still what is being used for the first step here.
  return async(mergeByURLFilter)
  .then(() => async(["resource-type", "if-domain"].map(mergeByArrayProperty)))
  .then(() => rulesInfo.map(info => info.rule));
}
+ |
let ContentBlockerList = |
/** |
* Create a new Adblock Plus filter to content blocker list converter |
* |
+ * @param {object} options Options for content blocker list generation |
+ * @param {string} options.merge Rule merging mode, "auto" if not given. |
+ *   As used by generateRules: "all" always merges rules (exhaustively), |
+ *   "auto" merges only if there are more than 50000 rules in total, and |
+ *   any other value results in no merging. |
+ * |
* @constructor |
*/ |
-exports.ContentBlockerList = function () |
+exports.ContentBlockerList = function (options) |
{ |
+ const defaultOptions = { |
+ merge: "auto" |
+ }; |
+ |
+ this.options = Object.assign({}, defaultOptions, options); |
+ |
this.requestFilters = []; |
this.requestExceptions = []; |
this.elemhideFilters = []; |
this.elemhideExceptions = []; |
this.genericblockExceptions = []; |
this.generichideExceptions = []; |
this.elemhideSelectorExceptions = new Map(); |
}; |
@@ -671,22 +1190,26 @@ |
domains = this.elemhideSelectorExceptions[filter.selector] = []; |
parseDomains(filter.domains, domains, []); |
} |
}; |
/** |
* Generate content blocker list for all filters that were added |
- * |
- * @returns {Filter} filter Filter to convert |
*/ |
-ContentBlockerList.prototype.generateRules = function(filter) |
+ContentBlockerList.prototype.generateRules = function() |
{ |
- let rules = []; |
+ let cssRules = []; |
+ let cssExceptionRules = []; |
+ let blockingRules = []; |
+ let blockingExceptionRules = []; |
+ |
+ let ruleGroups = [cssRules, cssExceptionRules, |
+ blockingRules, blockingExceptionRules]; |
let genericSelectors = []; |
let groupedElemhideFilters = new Map(); |
for (let filter of this.elemhideFilters) |
{ |
let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); |
if (!result) |
@@ -723,35 +1246,57 @@ |
let genericSelectorExceptionDomains = |
extractFilterDomains(this.generichideExceptions); |
elemhideExceptionDomains.forEach(name => |
{ |
genericSelectorExceptionDomains.add(name); |
}); |
- addCSSRules(rules, genericSelectors, "^https?://", |
+ addCSSRules(cssRules, genericSelectors, "^https?://", |
genericSelectorExceptionDomains); |
groupedElemhideFilters.forEach((selectors, matchDomain) => |
{ |
- addCSSRules(rules, selectors, matchDomain, elemhideExceptionDomains); |
+ addCSSRules(cssRules, selectors, matchDomain, elemhideExceptionDomains); |
}); |
let requestFilterExceptionDomains = []; |
for (let filter of this.genericblockExceptions) |
{ |
let parsed = parseFilterRegexpSource(filter.regexpSource); |
if (parsed.hostname) |
requestFilterExceptionDomains.push(parsed.hostname); |
} |
for (let filter of this.requestFilters) |
{ |
- convertFilterAddRules(rules, filter, "block", true, |
+ convertFilterAddRules(blockingRules, filter, "block", true, |
requestFilterExceptionDomains); |
} |
for (let filter of this.requestExceptions) |
- convertFilterAddRules(rules, filter, "ignore-previous-rules", true); |
+ convertFilterAddRules(blockingExceptionRules, filter, |
kzar
2017/07/25 12:18:53
Nit: Please use braces for this for loop since it
Manish Jethani
2017/07/28 09:17:36
Done.
|
+ "ignore-previous-rules", true); |
+ |
+ return async(ruleGroups.map((group, index) => () => |
+ { |
+ let next = () => |
+ { |
+ if (index == ruleGroups.length - 1) |
+ return ruleGroups.reduce((all, rules) => all.concat(rules), []); |
+ }; |
- return rules; |
+ if (this.options.merge == "all" || |
+ (this.options.merge == "auto" && |
+ ruleGroups.reduce((n, group) => n + group.length, 0) > 50000)) |
+ { |
+ return mergeRules(ruleGroups[index], this.options.merge == "all") |
+ .then(rules => |
+ { |
+ ruleGroups[index] = rules; |
+ return next(); |
+ }); |
+ } |
+ |
+ return next(); |
+ })); |
}; |