Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: lib/abp2blocklist.js

Issue 29426594: Issue 3673 - Merge closely matching rules (Closed) Base URL: https://hg.adblockplus.org/abp2blocklist
Patch Set: Move merge options to constructor and update comments Created May 8, 2017, 11:09 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * This file is part of Adblock Plus <https://adblockplus.org/>, 2 * This file is part of Adblock Plus <https://adblockplus.org/>,
3 * Copyright (C) 2006-2017 eyeo GmbH 3 * Copyright (C) 2006-2017 eyeo GmbH
4 * 4 *
5 * Adblock Plus is free software: you can redistribute it and/or modify 5 * Adblock Plus is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 3 as 6 * it under the terms of the GNU General Public License version 3 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * Adblock Plus is distributed in the hope that it will be useful, 9 * Adblock Plus is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
(...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 { 359 {
360 newSelector.push(selector.substring(i, pos.start)); 360 newSelector.push(selector.substring(i, pos.start));
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); 361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']');
362 i = pos.end; 362 i = pos.end;
363 } 363 }
364 newSelector.push(selector.substring(i)); 364 newSelector.push(selector.substring(i));
365 365
366 return newSelector.join(""); 366 return newSelector.join("");
367 } 367 }
368 368
/**
 * Check if two strings are a close match
 *
 * This function returns an edit operation, one of "substitute", "delete", and
 * "insert", along with an index in the source string where the edit must occur
 * in order to arrive at the target string. If the strings are not a close
 * match, it returns null.
 *
 * Two strings are considered to be a close match if they are one edit
 * operation apart.
 *
 * Deletions or insertions of a contiguous range of characters from one string
 * into the other, at the same index, are treated as a single edit. For
 * example, "internal" and "international" are considered to be one edit apart
 * and therefore a close match.
 *
 * A few things to note:
 *
 *   1) This function does not care about the format of the input strings. For
 *   example, the caller may pass in regular expressions, where "[ab]" and
 *   "[bc]" could be considered to be a close match, since the order within the
 *   brackets doesn't matter. This function will still return null for this set
 *   of inputs since they are two edits apart.
 *
 *   2) To be friendly to calling code that might be passing in regular
 *   expressions, this function will simply return null if it encounters a
 *   special character (e.g. "\", "?", "+", "*", etc.) in the delta. For
 *   example, given "Hello" and "Hello, how are you?", it will return null.
 *   For a substitution the delta consists of one character from each string,
 *   and both of them are checked, so the result does not depend on the order
 *   in which the two strings are passed in.
 *
 *   3) If the caller does indeed pass in regular expressions, it must make the
 *   important assumption that the parts where two such regular expressions may
 *   differ can always be treated as normal strings. For example,
 *   "^https?://example.com/ads" and "^https?://example.com/adv" differ only in
 *   the last character, therefore the regular expressions can safely be merged
 *   into "^https?://example.com/ad[sv]".
 *
 * The returned object has a "type" and an "index" property. For a deletion or
 * an insertion of more than one character it additionally has an "endIndex"
 * property, which is the index just past the affected range in the longer of
 * the two strings.
 *
 * @param {string} s The source string
 * @param {string} t The target string
 *
 * @returns {?object} An object describing the single edit operation that must
 *                    occur in the source string in order to arrive at the
 *                    target string, or null if there is no such operation
 */
function closeMatch(s, t)
{
  // Characters that carry a special meaning in a regular expression. A delta
  // containing any of these cannot be treated as a plain string.
  const specialCharacters = ".+$?{}()[]\\*^|";

  let diff = s.length - t.length;

  // If target is longer than source, swap them for the purpose of our
  // calculation. Note that diff keeps its sign, which is how we later tell
  // an insertion from a deletion.
  if (diff < 0)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let i = 0;
  let j = 0;

  // Start from the beginning and keep going until we hit a character that
  // doesn't match.
  for (; i < s.length; i++)
  {
    if (s[i] != t[i])
      break;
  }

  // Now do exactly the same from the end, but also stop if we reach the
  // position where we terminated the previous loop.
  for (; j < t.length; j++)
  {
    if (t.length - j == i || s[s.length - j - 1] != t[t.length - j - 1])
      break;
  }

  if (diff == 0)
  {
    // If the strings are equal in length and the delta isn't exactly one
    // character, it's not a close match.
    if (t.length - j - i != 1)
      return null;
  }
  else if (i != t.length - j)
  {
    // For strings of unequal length, if we haven't found a match for every
    // single character in the shorter string counting from both the beginning
    // and the end, it's not a close match.
    return null;
  }

  for (let k = i; k < s.length - j; k++)
  {
    // If the delta contains any special characters, it's not a close match.
    if (specialCharacters.includes(s[k]))
      return null;

    // For a substitution the character at the same position in the other
    // string is part of the delta as well.
    if (diff == 0 && specialCharacters.includes(t[k]))
      return null;
  }

  let edit = null;

  if (diff == 0)
  {
    edit = {type: "substitute", index: i};
  }
  else if (diff > 0)
  {
    edit = {type: "delete", index: i};

    if (diff > 1)
      edit.endIndex = s.length - j;
  }
  else
  {
    edit = {type: "insert", index: i};

    if (diff < -1)
      edit.endIndex = s.length - j;
  }

  return edit;
}
490
/**
 * Drop rules whose URL filter is made redundant by another rule's URL filter
 *
 * A rule is marked as redundant, by setting its "redundant" property to true,
 * if its URL filter starts with the complete URL filter of another rule. Two
 * rules with identical URL filters are treated the same way, so that only one
 * of them survives.
 *
 * The one exception is when the shared prefix ends right before a quantifier
 * in the longer URL filter. For example, "ab" is a prefix of "ab?c", but the
 * latter also matches "ac", which the former doesn't, so neither is redundant.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}, where the rule has
 *                             a string trigger["url-filter"]
 *
 * @returns {object[]} A new array containing only those elements of rulesInfo
 *                     that have not been marked as redundant
 */
function eliminateRedundantRulesByURLFilter(rulesInfo)
{
  // Whether the longer URL filter starts with the shorter one, with the
  // prefix not being cut off right before a quantifier ("?", "*", or "{")
  // that would apply to the last part of the prefix.
  let isCoveredBy = (longer, shorter) =>
  {
    if (!longer.startsWith(shorter))
      return false;

    let next = longer[shorter.length];

    return next != "?" && next != "*" && next != "{";
  };

  for (let i = 0; i < rulesInfo.length; i++)
  {
    // If this rule is already marked as redundant, don't bother comparing it
    // with other rules.
    if (rulesInfo[i].redundant)
      continue;

    let source = rulesInfo[i].rule.trigger["url-filter"];

    for (let j = i + 1; j < rulesInfo.length; j++)
    {
      if (rulesInfo[j].redundant)
        continue;

      let target = rulesInfo[j].rule.trigger["url-filter"];

      if (source.length >= target.length)
      {
        // The source is covered by the target, there's no point in comparing
        // it with any further rules.
        if (isCoveredBy(source, target))
        {
          rulesInfo[i].redundant = true;
          break;
        }
      }
      else if (isCoveredBy(target, source))
      {
        rulesInfo[j].redundant = true;
      }
    }
  }

  return rulesInfo.filter(ruleInfo => !ruleInfo.redundant);
}
527
/**
 * Merge rules whose URL filters are a close match
 *
 * Every rule is compared with the rules that follow it using closeMatch().
 * Rules that are a single character edit apart at the same index are merged
 * into one rule using a bracket expression, optionally followed by "?", e.g.
 * "ad", "ads", and "adv" become "ad[sv]?". Failing that, a rule can be merged
 * with one other rule that is a multiple character edit apart, e.g. "ad" and
 * "adserver" become "ad(server)?".
 *
 * The function does not return anything; it works by modifying its input:
 *
 *   - The trigger["url-filter"] of a rule that other rules have been merged
 *     into is rewritten, and "mergedInto" is set to true on its info object.
 *   - "merged" is set to true on the info object of every rule that has been
 *     merged into another rule.
 *   - In exhaustive mode, "redundant" is set to true on the info objects of
 *     rules dropped by eliminateRedundantRulesByURLFilter().
 *   - The bookkeeping properties "matches", "bestMatches", and
 *     "multiEditMatch" are left behind on the info objects.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}, where the rule has
 *                             a string trigger["url-filter"]
 * @param {boolean} exhaustive Whether to compare each rule with all the rules
 *                             that follow it rather than only the nearest
 *                             ones, and to throw out redundant rules first
 */
function mergeRulesByURLFilter(rulesInfo, exhaustive)
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range by default. If we increase this value,
  // it can give us more matches and a smaller resulting rule set, but possibly
  // at a significant performance cost.
  //
  // If the exhaustive option is true, we simply ignore this value and look for
  // matches throughout the rule set.
  const heuristicRange = 10;

  if (exhaustive)
  {
    // Throw out obviously redundant rules. From here on the indices we store
    // refer to this filtered array.
    rulesInfo = eliminateRedundantRulesByURLFilter(rulesInfo);
  }

  if (rulesInfo.length <= 1)
    return;

  for (let i = 0; i < rulesInfo.length; i++)
  {
    let limit = exhaustive ? rulesInfo.length :
                Math.min(i + heuristicRange, rulesInfo.length);

    for (let j = i + 1; j < limit; j++)
    {
      let source = rulesInfo[i].rule.trigger["url-filter"];
      let target = rulesInfo[j].rule.trigger["url-filter"];

      let edit = closeMatch(source, target);

      if (edit)
      {
        let urlFilter, ruleInfo, match = {edit};

        if (edit.type == "insert")
        {
          // Convert the insertion into a deletion and stick it on the target
          // rule instead. We can only group deletions and substitutions;
          // therefore insertions must be treated as deletions on the target
          // rule.
          urlFilter = target;
          ruleInfo = rulesInfo[j];
          match.index = i;
          edit.type = "delete";
        }
        else
        {
          urlFilter = source;
          ruleInfo = rulesInfo[i];
          match.index = j;
        }

        // If the edit has an end index, it represents a multiple character
        // edit.
        let multiEdit = !!edit.endIndex;

        if (multiEdit)
        {
          // We only care about a single multiple character edit because the
          // number of characters for such a match doesn't matter, we can
          // only merge with one other rule.
          if (!ruleInfo.multiEditMatch)
            ruleInfo.multiEditMatch = match;
        }
        else
        {
          // For single character edits, multiple rules can be merged into
          // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
          if (!ruleInfo.matches)
            ruleInfo.matches = new Array(urlFilter.length);

          // Matches at a particular index. For example, for a source string
          // "ads", both target strings "ad" (deletion) and "adv"
          // (substitution) match at index 2, hence they are grouped together
          // to possibly be merged later into "ad[sv]?".
          let matchesForIndex = ruleInfo.matches[edit.index];

          if (matchesForIndex)
          {
            matchesForIndex.push(match);
          }
          else
          {
            matchesForIndex = [match];
            ruleInfo.matches[edit.index] = matchesForIndex;
          }

          // Keep track of the best set of matches. We later sort by this to
          // get best results.
          if (!ruleInfo.bestMatches ||
              matchesForIndex.length > ruleInfo.bestMatches.length)
            ruleInfo.bestMatches = matchesForIndex;
        }
      }
    }
  }

  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch;
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
    {
      matchesForIndex = (matchesForIndex || []).filter(match =>
      {
        // Filter out rules that have either already been merged into other
        // rules or have had other rules merged into them.
        return !rulesInfo[match.index].merged &&
               !rulesInfo[match.index].mergedInto;
      });

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        !rulesInfo[ruleInfo.multiEditMatch.index].merged &&
        !rulesInfo[ruleInfo.multiEditMatch.index].mergedInto)
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length > 0)
    {
      let urlFilter = rule.trigger["url-filter"];

      let editIndex = best[0].edit.index;

      if (!multiEdit)
      {
        // Merge all the matching rules into this one.

        let characters = [];
        let quantifier = "";
        let hasHyphen = false;

        for (let match of best)
        {
          if (match.edit.type == "delete")
          {
            quantifier = "?";
          }
          else
          {
            let character = rulesInfo[match.index].rule
                            .trigger["url-filter"][editIndex];

            // A hyphen between two characters inside brackets denotes a
            // range, e.g. "[_-x]" is everything from "_" to "x" rather than
            // the three characters listed. We set it aside here and add it
            // as the very last character instead, where it is taken
            // literally.
            if (character == "-")
              hasHyphen = true;
            else
              characters.push(character);
          }

          // Mark the target rule as merged so other rules don't try to merge
          // it again.
          rulesInfo[match.index].merged = true;
        }

        if (hasHyphen)
          characters.push("-");

        urlFilter = urlFilter.substring(0, editIndex + 1) + quantifier +
                    urlFilter.substring(editIndex + 1);
        if (characters.length > 0)
        {
          urlFilter = urlFilter.substring(0, editIndex) + "[" +
                      urlFilter[editIndex] + characters.join("") + "]" +
                      urlFilter.substring(editIndex + 1);
        }
      }
      else
      {
        let editEndIndex = best[0].edit.endIndex;

        // Mark the target rule as merged so other rules don't try to merge it
        // again.
        rulesInfo[best[0].index].merged = true;

        urlFilter = urlFilter.substring(0, editIndex) + "(" +
                    urlFilter.substring(editIndex, editEndIndex) + ")?" +
                    urlFilter.substring(editEndIndex);
      }

      rule.trigger["url-filter"] = urlFilter;

      // Mark this rule as one that has had other rules merged into it.
      ruleInfo.mergedInto = true;
    }
  }
}
745
/**
 * Merge a group of rules that differ only in the value of an array property
 *
 * All the rules are merged into the first one, which ends up with the union
 * of the values from all the rules, in the order in which they were first
 * encountered. The first rule's info object gets "mergedInto" set to true,
 * all the others get "merged" set to true. Nothing happens if the group
 * contains fewer than two rules.
 *
 * If any rule in the group has no value for the property at all, the property
 * is removed from the merged rule rather than set to the union. Such a rule
 * is taken to be unrestricted with regard to the property (e.g. a rule
 * without "if-domain" is presumed to apply on all domains), so restricting
 * the merged rule to the values of the other rules would narrow it.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}; the caller is
 *                             expected to pass in rules that are identical
 *                             except for the given property
 * @param {string} propertyType The part of the rule containing the property,
 *                              e.g. "trigger"
 * @param {string} property The name of the array property, e.g. "if-domain"
 */
function mergeRulesByArrayProperty(rulesInfo, propertyType, property)
{
  if (rulesInfo.length <= 1)
    return;

  let valueSet = new Set();
  let unrestricted = false;

  rulesInfo.forEach((ruleInfo, index) =>
  {
    let values = ruleInfo.rule[propertyType][property];

    if (values)
    {
      for (let value of values)
        valueSet.add(value);
    }
    else
    {
      // This rule has no restriction, which means neither can the result of
      // the merge.
      unrestricted = true;
    }

    if (index > 0)
      ruleInfo.merged = true;
  });

  let mergedRule = rulesInfo[0].rule;

  if (unrestricted)
    delete mergedRule[propertyType][property];
  else if (valueSet.size > 0)
    mergedRule[propertyType][property] = Array.from(valueSet);

  rulesInfo[0].mergedInto = true;
}
770
/**
 * Group rules that are identical except for the value of one property
 *
 * For every rule a shallow copy of its trigger and action is made, the given
 * property is removed from the copy, and what remains is serialized to JSON.
 * Rules that serialize to the same string end up in the same group. The rules
 * themselves are not modified.
 *
 * @param {object[]} rulesInfo Objects of the form {rule}
 * @param {string} propertyType The part of the rule containing the property,
 *                              either "trigger" or "action"
 * @param {string} property The name of the property to disregard
 *
 * @returns {Map.<string,object[]>} The elements of rulesInfo by group key, in
 *                                  the order in which they were encountered
 */
function groupRulesByMergeableProperty(rulesInfo, propertyType, property)
{
  return rulesInfo.reduce((groups, ruleInfo) =>
  {
    let {trigger, action} = ruleInfo.rule;

    // Work on copies so that the property can be dropped without touching
    // the rule itself.
    let stripped = {
      trigger: Object.assign({}, trigger),
      action: Object.assign({}, action)
    };

    delete stripped[propertyType][property];

    let key = JSON.stringify(stripped);
    let members = groups.get(key);

    if (!members)
    {
      members = [];
      groups.set(key, members);
    }

    members.push(ruleInfo);

    return groups;
  },
  new Map());
}
796
/**
 * Merge rules where possible in order to arrive at a smaller rule set
 *
 * First, rules that differ only in their URL filter are merged by URL filter.
 * Then rules that differ only in their "resource-type" are merged, followed
 * by rules that differ only in their "if-domain". Rules that have been marked
 * as redundant or as merged into another rule are left out of the result.
 *
 * Note that the rule objects passed in are modified in the process.
 *
 * @param {object[]} rules The rules to merge
 * @param {object} [options] Options; the only one recognized is "exhaustive",
 *                           false by default, which is passed on to
 *                           mergeRulesByURLFilter()
 *
 * @returns {object[]} The rules that remain after merging
 */
function mergeRules(rules, options)
{
  // Review thread on the default options below (2017/05/09, cut off in this
  // capture):
  //   kzar: "Have defaultOptions be a property on ContentBlocke..."
  //   Manish Jethani: "Actually the options for the ContentBlockerList co..."
  let settings = Object.assign({}, {exhaustive: false}, options);

  // Wrap each rule so that bookkeeping flags can be attached to it without
  // polluting the rule itself.
  let rulesInfo = rules.map(rule => ({rule}));

  let groupsByURLFilter = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                                        "url-filter");

  for (let group of groupsByURLFilter.values())
  {
    if (group.length > 1)
      mergeRulesByURLFilter(group, settings.exhaustive);
  }

  // Filter out rules that are redundant or have been merged into other rules.
  rulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return !ruleInfo.redundant && !ruleInfo.merged;
  });

  for (let arrayProperty of ["resource-type", "if-domain"])
  {
    let groups = groupRulesByMergeableProperty(rulesInfo, "trigger",
                                               arrayProperty);

    for (let group of groups.values())
    {
      if (group.length > 1)
        mergeRulesByArrayProperty(group, "trigger", arrayProperty);
    }

    // Drop what has just been merged before moving on to the next property.
    rulesInfo = rulesInfo.filter(ruleInfo => !ruleInfo.merged);
  }

  return rulesInfo.map(ruleInfo => ruleInfo.rule);
}
830
369 let ContentBlockerList = 831 let ContentBlockerList =
370 /** 832 /**
371 * Create a new Adblock Plus filter to content blocker list converter 833 * Create a new Adblock Plus filter to content blocker list converter
372 * 834 *
835 * @param {object} options Options for content blocker list generation
836 *
373 * @constructor 837 * @constructor
374 */ 838 */
375 exports.ContentBlockerList = function () 839 exports.ContentBlockerList = function(options)
376 { 840 {
841 const defaultOptions = {
842 merge: false,
843 exhaustiveMerge: false
844 };
845
846 this.options = Object.assign({}, defaultOptions, options);
847
377 this.requestFilters = []; 848 this.requestFilters = [];
378 this.requestExceptions = []; 849 this.requestExceptions = [];
379 this.elemhideFilters = []; 850 this.elemhideFilters = [];
380 this.elemhideExceptions = []; 851 this.elemhideExceptions = [];
381 this.elemhideSelectorExceptions = new Map(); 852 this.elemhideSelectorExceptions = new Map();
382 }; 853 };
383 854
384 /** 855 /**
385 * Add Adblock Plus filter to be converted 856 * Add Adblock Plus filter to be converted
386 * 857 *
(...skipping 27 matching lines...) Expand all
414 let domains = this.elemhideSelectorExceptions[filter.selector]; 885 let domains = this.elemhideSelectorExceptions[filter.selector];
415 if (!domains) 886 if (!domains)
416 domains = this.elemhideSelectorExceptions[filter.selector] = []; 887 domains = this.elemhideSelectorExceptions[filter.selector] = [];
417 888
418 parseDomains(filter.domains, domains, []); 889 parseDomains(filter.domains, domains, []);
419 } 890 }
420 }; 891 };
421 892
422 /** 893 /**
423 * Generate content blocker list for all filters that were added 894 * Generate content blocker list for all filters that were added
424 *
425 * @returns {Filter} filter Filter to convert
426 */ 895 */
427 ContentBlockerList.prototype.generateRules = function(filter) 896 ContentBlockerList.prototype.generateRules = function()
428 { 897 {
429 let rules = []; 898 let rules = [];
430 899
431 let groupedElemhideFilters = new Map(); 900 let groupedElemhideFilters = new Map();
432 for (let filter of this.elemhideFilters) 901 for (let filter of this.elemhideFilters)
433 { 902 {
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); 903 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);
435 if (!result) 904 if (!result)
436 continue; 905 continue;
437 906
(...skipping 27 matching lines...) Expand all
465 } 934 }
466 }); 935 });
467 936
468 for (let filter of this.elemhideExceptions) 937 for (let filter of this.elemhideExceptions)
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); 938 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);
470 for (let filter of this.requestFilters) 939 for (let filter of this.requestFilters)
471 convertFilterAddRules(rules, filter, "block", true); 940 convertFilterAddRules(rules, filter, "block", true);
472 for (let filter of this.requestExceptions) 941 for (let filter of this.requestExceptions)
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); 942 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);
474 943
475 return rules.filter(rule => !hasNonASCI(rule)); 944 rules = rules.filter(rule => !hasNonASCI(rule));
945
946 if (this.options.merge)
947 rules = mergeRules(rules, {exhaustive: this.options.exhaustiveMerge});
kzar 2017/05/09 10:05:47 Why wrap the exhaustiveMerge option in an Object h
Manish Jethani 2017/05/09 15:52:47 This is because mergeRules takes an option called
kzar 2017/05/09 16:50:58 Well if you passed through the value of exhaustive
Manish Jethani 2017/05/09 17:32:11 Done.
948
949 return rules;
476 }; 950 };
OLD | NEW
« no previous file with comments | « abp2blocklist.js ('k') | test/abp2blocklist.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld