OLD | NEW |
1 /* | 1 /* |
2 * This file is part of Adblock Plus <https://adblockplus.org/>, | 2 * This file is part of Adblock Plus <https://adblockplus.org/>, |
3 * Copyright (C) 2006-2017 eyeo GmbH | 3 * Copyright (C) 2006-2017 eyeo GmbH |
4 * | 4 * |
5 * Adblock Plus is free software: you can redistribute it and/or modify | 5 * Adblock Plus is free software: you can redistribute it and/or modify |
6 * it under the terms of the GNU General Public License version 3 as | 6 * it under the terms of the GNU General Public License version 3 as |
7 * published by the Free Software Foundation. | 7 * published by the Free Software Foundation. |
8 * | 8 * |
9 * Adblock Plus is distributed in the hope that it will be useful, | 9 * Adblock Plus is distributed in the hope that it will be useful, |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 * GNU General Public License for more details. | 12 * GNU General Public License for more details. |
13 * | 13 * |
14 * You should have received a copy of the GNU General Public License | 14 * You should have received a copy of the GNU General Public License |
15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 15 * along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
16 */ | 16 */ |
17 | 17 |
18 /** @module abp2blocklist */ | 18 /** @module abp2blocklist */ |
19 | 19 |
20 "use strict"; | 20 "use strict"; |
21 | 21 |
| 22 const crypto = require("crypto"); |
| 23 |
22 let filterClasses = require("filterClasses"); | 24 let filterClasses = require("filterClasses"); |
23 let tldjs = require("tldjs"); | 25 let tldjs = require("tldjs"); |
24 let punycode = require("punycode"); | 26 let punycode = require("punycode"); |
25 | 27 |
26 const selectorLimit = 5000; | 28 const selectorLimit = 5000; |
27 const typeMap = filterClasses.RegExpFilter.typeMap; | 29 const typeMap = filterClasses.RegExpFilter.typeMap; |
28 const whitelistableRequestTypes = (typeMap.IMAGE | 30 const whitelistableRequestTypes = (typeMap.IMAGE |
29 | typeMap.STYLESHEET | 31 | typeMap.STYLESHEET |
30 | typeMap.SCRIPT | 32 | typeMap.SCRIPT |
31 | typeMap.FONT | 33 | typeMap.FONT |
(...skipping 327 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
359 { | 361 { |
360 newSelector.push(selector.substring(i, pos.start)); | 362 newSelector.push(selector.substring(i, pos.start)); |
361 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); | 363 newSelector.push('[id=', selector.substring(pos.start + 1, pos.end), ']'); |
362 i = pos.end; | 364 i = pos.end; |
363 } | 365 } |
364 newSelector.push(selector.substring(i)); | 366 newSelector.push(selector.substring(i)); |
365 | 367 |
366 return newSelector.join(""); | 368 return newSelector.join(""); |
367 } | 369 } |
368 | 370 |
/**
 * Determine whether two strings are exactly one edit apart and, if so,
 * describe that edit.
 *
 * @param {string} s Source string
 * @param {string} t Target string
 * @param {object} [options]
 * @param {boolean} [options.singleCharacterOnly=false]
 *   If true, only single-character edits are considered. If false, the
 *   deletion or insertion of one contiguous range of characters is also
 *   treated as a single edit.
 * @returns {?object}
 *   <code>{type, index}</code> for a single-character edit or
 *   <code>{type, index, endIndex}</code> for a multiple-character edit, where
 *   type is one of "substitute", "delete" and "insert"; null if the strings
 *   are not a close match.
 */
function closeMatch(s, t, {singleCharacterOnly = false} = {})
{
  // This function returns an edit operation, one of "substitute", "delete",
  // and "insert", along with an index in the source string where the edit must
  // occur in order to arrive at the target string. If the strings are not a
  // close match, it returns null.

  // If singleCharacterOnly is false, deletions or insertions of a contiguous
  // range of characters from one string into the other, at the same index, are
  // treated as a single edit. For example, "internal" and "international" are
  // considered to be one edit apart, inserting the substring "tiona" from the
  // latter into the former.

  // A few things to note:
  //
  //   1) This function does not care about how the input strings are treated
  //      by the caller. It only treats them as raw strings. For example, the
  //      caller may treat them as regular expressions, where "[ab]" and "[bc]"
  //      could be considered to have an edit distance of 1, since the order
  //      within the brackets does not matter. This function will still return
  //      null for this set of inputs since they are two edits apart.
  //
  //   2) To be friendly to calling code that might be passing in regular
  //      expressions anyway, this function will simply return null if it
  //      encounters a special character (e.g. "\", "?", "+", "*", "^", etc.)
  //      in the delta. For example, given "Hello" and "Hello, how are you?",
  //      it will return null instead of
  //      "{type: 'insert', index: 5, endIndex: 19}".
  //
  //   3) The calling code within this file does indeed pass in regular
  //      expressions (the strict subset of JavaScript regular expressions
  //      supported by WebKit for content blockers), making the important
  //      assumption that the parts where two such regular expressions may
  //      differ can always be treated as normal strings.
  //
  //      For example, "^https?://.*/ads" and "^https?://.*/adv" differ only in
  //      the last character, therefore the regular expressions can safely be
  //      merged into "^https?://.*/ad[sv]". If, for example, the characters in
  //      the delta were to appear within square brackets originally in the
  //      input strings (e.g. "^https?://.*/ad[sx]" and "^https?://.*/ad[vx]"),
  //      the calling code would have to do extra work to merge the two regular
  //      expressions correctly. The calling code within this file assumes that
  //      this is never the case.

  // Characters that carry meaning in a regular expression. "*", "^" and "|"
  // are included as well: treating them as plain characters would let the
  // caller turn an anchor into a negated class (e.g. "[^a]http") or break up
  // a ".*" sequence.
  const specialCharacters = ".+$?{}()[]\\*^|";

  // Characters that start a quantifier and therefore bind to whatever
  // immediately precedes them.
  const quantifierCharacters = "?+*{";

  let isSpecial = c => c !== undefined && specialCharacters.indexOf(c) != -1;

  let diff = s.length - t.length;

  // If the string lengths differ by more than one character, we cannot arrive
  // at target from source in a single edit operation.
  if (singleCharacterOnly && (diff < -1 || diff > 1))
    return null;

  // If target is longer than source, swap them for the purpose of our
  // calculation. Note that diff keeps its original sign, which is what later
  // distinguishes an insertion from a deletion.
  if (diff < 0)
  {
    let tmp = s;
    s = t;
    t = tmp;
  }

  let edit = null;

  // If the string lengths differ by only one character at most, use the simple
  // algorithm to find a single character edit.
  if (diff >= -1 && diff <= 1)
  {
    for (let i = 0, j = 0; i < s.length; i++)
    {
      if (s[i] == t[j])
      {
        j++;
        continue;
      }

      // Since we want one and only one edit operation, we must bail here.
      if (edit)
        return null;

      // We don't deal with special characters for now. The character at the
      // same position in the shorter string is checked too, so that a
      // quantifier directly following the edit never ends up applying to the
      // merged expression.
      if (isSpecial(s[i]) || isSpecial(t[j]))
        return null;

      if (diff == 0)
      {
        // If both strings are equal in length, this is a substitution.
        edit = {type: "substitute", index: i};
        j++;
      }
      else if (diff > 0)
      {
        // If the source string is longer, this is a deletion.
        edit = {type: "delete", index: i};
      }
      else
      {
        // The target string is longer, this is an insertion.
        edit = {type: "insert", index: i};
      }
    }
  }
  else if (!singleCharacterOnly)
  {
    // Try another algorithm to find a multiple character deletion or
    // insertion.

    let i = 0;
    let j = 0;

    // Length of the common prefix.
    for (; i < s.length; i++)
    {
      if (s[i] != t[i])
        break;
    }

    // Length of the common suffix, not overlapping the common prefix.
    for (; j < t.length; j++)
    {
      if (t.length - j == i ||
          s[s.length - j - 1] != t[t.length - j - 1])
        break;
    }

    // The prefix and the suffix together must account for the whole of the
    // shorter string, otherwise this is more than one edit.
    if (i != t.length - j)
      return null;

    for (let k = i; k < s.length - j; k++)
    {
      // If there are any special characters in the delta, bail.
      if (isSpecial(s[k]))
        return null;
    }

    // If the delta is directly followed by a quantifier, that quantifier
    // binds to the last character of the delta in one string but to a
    // different character in the other; the two cannot be merged by making
    // the delta optional.
    let following = s[s.length - j];
    if (following !== undefined &&
        quantifierCharacters.indexOf(following) != -1)
      return null;

    if (diff > 0)
    {
      edit = {type: "delete", index: i, endIndex: s.length - j};
    }
    else
    {
      edit = {type: "insert", index: i, endIndex: s.length - j};
    }
  }

  return edit;
}
| 518 |
/**
 * Merge content blocker rules that are identical except for URL filters that
 * are one edit apart (as determined by closeMatch), reducing the total number
 * of rules.
 *
 * The URL filter of a rule that absorbs other rules is rewritten in place on
 * that rule object; rules of type "ignore-previous-rules" are never merged.
 *
 * @param {object[]} rules Rules, each with a trigger (holding "url-filter")
 *                         and an action (holding type)
 * @param {object} [options]
 * @param {boolean} [options.advanced=false]
 *   Also merge rules whose URL filters differ by one contiguous range of
 *   characters, e.g. "ad" and "adserver" into "ad(server)?".
 * @param {boolean} [options.exhaustive=false]
 *   Compare every rule with every following rule instead of only looking
 *   within a fixed range.
 * @returns {object[]} The rules that remain after merging, in their original
 *                     relative order
 */
function mergeCloselyMatchingRules(rules,
                                   {advanced = false, exhaustive = false} = {})
{
  // Closely matching rules are likely to be within a certain range. We only
  // look for matches within this range. If we increase this value, it can give
  // us more matches and a smaller resulting rule set, but possibly at a
  // significant performance cost.
  const heuristicRange = 100;

  // Maps the serialized form of a rule (minus its URL filter) to a small
  // integer. Two rules get the same group number if and only if they are
  // identical apart from the URL filter; unlike a truncated digest, this can
  // never put two different rules into the same group.
  let groups = new Map();

  let rulesInfo = rules.map(rule =>
  {
    let info = {rule};

    if (rule.action.type == "ignore-previous-rules")
    {
      info.skip = true;
      return info;
    }

    // Serialize the rule but without the URL filter. We use this for
    // comparison later.
    let copy = {
      trigger: Object.assign({}, rule.trigger),
      action: Object.assign({}, rule.action)
    };

    delete copy.trigger["url-filter"];

    let key = JSON.stringify(copy);
    let group = groups.get(key);

    if (group === undefined)
    {
      group = groups.size;
      groups.set(key, group);
    }

    info.group = group;
    return info;
  });

  for (let i = 0; i < rules.length; i++)
  {
    if (rulesInfo[i].skip)
      continue;

    let limit = exhaustive ? rules.length :
                Math.min(i + heuristicRange, rules.length);

    for (let j = i + 1; j < limit; j++)
    {
      if (rulesInfo[j].skip)
        continue;

      // Check if the rules are identical except for the URL filter.
      if (rulesInfo[i].group != rulesInfo[j].group)
        continue;

      let source = rules[i].trigger["url-filter"];
      let target = rules[j].trigger["url-filter"];

      let edit = closeMatch(source, target, {singleCharacterOnly: !advanced});

      if (!edit)
        continue;

      let urlFilter, ruleInfo, match = {edit};

      if (edit.type == "insert")
      {
        // Convert the insertion into a deletion and stick it on the target
        // rule instead. We can only group deletions and substitutions;
        // therefore insertions must be treated as deletions on the target
        // rule.
        urlFilter = target;
        ruleInfo = rulesInfo[j];
        match.index = i;
        edit.type = "delete";
      }
      else
      {
        urlFilter = source;
        ruleInfo = rulesInfo[i];
        match.index = j;
      }

      // If the edit has an end index, it represents a multiple character
      // edit.
      let multiEdit = !!edit.endIndex;

      if (multiEdit)
      {
        // We only care about a single multiple character edit because the
        // number of characters for such a match doesn't matter, we can
        // only merge with one other rule.
        if (!ruleInfo.multiEditMatch)
          ruleInfo.multiEditMatch = match;
      }
      else
      {
        // For single character edits, multiple rules can be merged into
        // one. e.g. "ad", "ads", and "adv" can be merged into "ad[sv]?".
        if (!ruleInfo.matches)
          ruleInfo.matches = new Array(urlFilter.length + 1);

        // Matches at a particular index. For example, for a source string
        // "ads", both target strings "ad" (deletion) and "adv"
        // (substitution) match at index 2, hence they are grouped together
        // to possibly be merged later into "ad[sv]?".
        let matchesForIndex = ruleInfo.matches[edit.index];

        if (matchesForIndex)
        {
          matchesForIndex.push(match);
        }
        else
        {
          matchesForIndex = [match];
          ruleInfo.matches[edit.index] = matchesForIndex;
        }

        // Keep track of the best set of matches. We later sort by this to
        // get best results.
        if (!ruleInfo.bestMatches ||
            matchesForIndex.length > ruleInfo.bestMatches.length)
          ruleInfo.bestMatches = matchesForIndex;
      }
    }
  }

  // Filter out rules that have no matches at all.
  let candidateRulesInfo = rulesInfo.filter(ruleInfo =>
  {
    return ruleInfo.bestMatches || ruleInfo.multiEditMatch;
  });

  // For best results, we have to sort the candidates by the largest set of
  // matches.
  //
  // For example, we want "ads", "bds", "adv", "bdv", "adx", and "bdx" to
  // generate "ad[svx]" and "bd[svx]" (2 rules), not "[ab]ds", "[ab]dv", and
  // "[ab]dx" (3 rules).
  candidateRulesInfo.sort((ruleInfo1, ruleInfo2) =>
  {
    let weight1 = ruleInfo1.bestMatches ? ruleInfo1.bestMatches.length :
                  ruleInfo1.multiEditMatch ? 1 : 0;
    let weight2 = ruleInfo2.bestMatches ? ruleInfo2.bestMatches.length :
                  ruleInfo2.multiEditMatch ? 1 : 0;

    return weight2 - weight1;
  });

  // A rule is still available as a merge partner only if it has neither been
  // merged into another rule nor had other rules merged into it (in which
  // case its URL filter no longer matches the recorded edit).
  let isAvailable = match => !rulesInfo[match.index].merged &&
                             !rulesInfo[match.index].mergedInto;

  for (let ruleInfo of candidateRulesInfo)
  {
    let rule = ruleInfo.rule;

    // If this rule has already been merged into another rule, we skip it.
    if (ruleInfo.merged)
      continue;

    // Find the best set of rules to group, which is simply the largest set.
    let best = (ruleInfo.matches || []).reduce((best, matchesForIndex) =>
    {
      matchesForIndex = (matchesForIndex || []).filter(isAvailable);

      return matchesForIndex.length > best.length ? matchesForIndex : best;
    },
    []);

    let multiEdit = false;

    // If we couldn't find a single rule to merge with, let's see if we have a
    // multiple character edit. e.g. we could merge "ad" and "adserver" into
    // "ad(server)?".
    if (best.length == 0 && ruleInfo.multiEditMatch &&
        isAvailable(ruleInfo.multiEditMatch))
    {
      best = [ruleInfo.multiEditMatch];
      multiEdit = true;
    }

    if (best.length == 0)
      continue;

    let urlFilter = rule.trigger["url-filter"];
    let editIndex = best[0].edit.index;

    if (!multiEdit)
    {
      // Merge all the matching rules into this one.

      let characters = [];
      let quantifier = "";

      for (let match of best)
      {
        if (match.edit.type == "delete")
        {
          quantifier = "?";
        }
        else
        {
          let character = rules[match.index].trigger["url-filter"][editIndex];
          characters.push(character);
        }

        // Mark the target rule as merged so other rules don't try to merge
        // it again.
        rulesInfo[match.index].merged = true;
      }

      // The part of the URL filter that replaces the character at the edit
      // index: either the character itself or a character class.
      let atom = urlFilter[editIndex];

      if (characters.length > 0)
      {
        let members = [];
        let hasHyphen = false;

        for (let character of [atom].concat(characters))
        {
          if (character == "-")
            hasHyphen = true;
          else if (members.indexOf(character) == -1)
            members.push(character);
        }

        // A hyphen between two other characters would denote a range (e.g.
        // "[s-v]" also matches "t" and "u"), so it always goes last, where it
        // is taken literally.
        if (hasHyphen)
          members.push("-");

        atom = "[" + members.join("") + "]";
      }

      urlFilter = urlFilter.substring(0, editIndex) + atom + quantifier +
                  urlFilter.substring(editIndex + 1);
    }
    else
    {
      let editEndIndex = best[0].edit.endIndex;

      // Mark the target rule as merged so other rules don't try to merge it
      // again.
      rulesInfo[best[0].index].merged = true;

      urlFilter = urlFilter.substring(0, editIndex) + "(" +
                  urlFilter.substring(editIndex, editEndIndex) + ")?" +
                  urlFilter.substring(editEndIndex);
    }

    rule.trigger["url-filter"] = urlFilter;

    // Mark this rule as one that has had other rules merged into it.
    ruleInfo.mergedInto = true;
  }

  // Filter out rules that have been merged into other rules.
  return rulesInfo.filter(ruleInfo => !ruleInfo.merged)
                  .map(ruleInfo => ruleInfo.rule);
}
| 766 |
369 let ContentBlockerList = | 767 let ContentBlockerList = |
370 /** | 768 /** |
371 * Create a new Adblock Plus filter to content blocker list converter | 769 * Create a new Adblock Plus filter to content blocker list converter |
372 * | 770 * |
373 * @constructor | 771 * @constructor |
374 */ | 772 */ |
375 exports.ContentBlockerList = function () | 773 exports.ContentBlockerList = function () |
376 { | 774 { |
377 this.requestFilters = []; | 775 this.requestFilters = []; |
378 this.requestExceptions = []; | 776 this.requestExceptions = []; |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
417 | 815 |
418 parseDomains(filter.domains, domains, []); | 816 parseDomains(filter.domains, domains, []); |
419 } | 817 } |
420 }; | 818 }; |
421 | 819 |
422 /** | 820 /** |
423 * Generate content blocker list for all filters that were added | 821 * Generate content blocker list for all filters that were added |
424 * | 822 * |
425 * @returns {Filter} filter Filter to convert | 823 * @returns {Filter} filter Filter to convert |
426 */ | 824 */ |
427 ContentBlockerList.prototype.generateRules = function(filter) | 825 ContentBlockerList.prototype.generateRules = function({ |
| 826 merge = false, |
| 827 fastMerge = true, |
| 828 advancedMerge, |
| 829 exhaustiveMerge |
| 830 } = {}) |
428 { | 831 { |
429 let rules = []; | 832 let rules = []; |
430 | 833 |
431 let groupedElemhideFilters = new Map(); | 834 let groupedElemhideFilters = new Map(); |
432 for (let filter of this.elemhideFilters) | 835 for (let filter of this.elemhideFilters) |
433 { | 836 { |
434 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); | 837 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions); |
435 if (!result) | 838 if (!result) |
436 continue; | 839 continue; |
437 | 840 |
(...skipping 27 matching lines...) Expand all Loading... |
465 } | 868 } |
466 }); | 869 }); |
467 | 870 |
468 for (let filter of this.elemhideExceptions) | 871 for (let filter of this.elemhideExceptions) |
469 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); | 872 convertFilterAddRules(rules, filter, "ignore-previous-rules", false); |
470 for (let filter of this.requestFilters) | 873 for (let filter of this.requestFilters) |
471 convertFilterAddRules(rules, filter, "block", true); | 874 convertFilterAddRules(rules, filter, "block", true); |
472 for (let filter of this.requestExceptions) | 875 for (let filter of this.requestExceptions) |
473 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); | 876 convertFilterAddRules(rules, filter, "ignore-previous-rules", true); |
474 | 877 |
475 return rules.filter(rule => !hasNonASCI(rule)); | 878 rules = rules.filter(rule => !hasNonASCI(rule)); |
| 879 |
| 880 if (merge) |
| 881 { |
| 882 // If the more specific options are specified (e.g. "advanced" and |
| 883 // "exhaustive"), they override the more general options (e.g. "fast"). |
| 884 let mergeOptions = { |
| 885 advanced: advancedMerge || (!fastMerge && advancedMerge != false), |
| 886 exhaustive: exhaustiveMerge || (!fastMerge && exhaustiveMerge != false) |
| 887 }; |
| 888 |
| 889 rules = mergeCloselyMatchingRules(rules, mergeOptions); |
| 890 } |
| 891 |
| 892 return rules; |
476 }; | 893 }; |
OLD | NEW |