lib/abp2blocklist.js - Issue 29340694: Issue 3956 - Convert domain whitelisting filters

Side by Side Diff: lib/abp2blocklist.js

Issue 29340694: Issue 3956 - Convert domain whitelisting filters (Closed)

Patch Set: Avoid creating so many temporary arrays Created May 17, 2016, 10:37 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * This file is part of Adblock Plus <https://adblockplus.org/>,	2 * This file is part of Adblock Plus <https://adblockplus.org/>,

3 * Copyright (C) 2006-2016 Eyeo GmbH	3 * Copyright (C) 2006-2016 Eyeo GmbH

4 *	4 *

5 * Adblock Plus is free software: you can redistribute it and/or modify	5 * Adblock Plus is free software: you can redistribute it and/or modify

6 * it under the terms of the GNU General Public License version 3 as	6 * it under the terms of the GNU General Public License version 3 as

7 * published by the Free Software Foundation.	7 * published by the Free Software Foundation.

8 *	8 *

9 * Adblock Plus is distributed in the hope that it will be useful,	9 * Adblock Plus is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
59 let excluded = [];	59 let excluded = [];

60 let rules = [];	60 let rules = [];

61	61

62 parseDomains(filter.domains, included, excluded);	62 parseDomains(filter.domains, included, excluded);

63	63

64 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))	64 if (excluded.length == 0 && !(filter.selector in elemhideSelectorExceptions))

65 return {matchDomains: included.map(matchDomain), selector: filter.selector};	65 return {matchDomains: included.map(matchDomain), selector: filter.selector};

66 }	66 }

67	67

68 /**	68 /**

69 * Convert the given filter "regexpSource" string into a regular expression,	69 * Parse the given filter "regexpSource" string. Producing a regular expression,

70 * handling the conversion of unicode inside hostnames to punycode.	70 * extracting the hostname (if any), deciding if the regular expression is safe

71 * (Also deciding if the regular expression can be safely converted to and	71 * to be converted + matched as lower case and noting if the source contains

72 * matched as lower case or not.)	72 * anything after the hostname.)

73 *	73 *

74 * @param {string} text regexpSource property of a filter	74 * @param {string} text regexpSource property of a filter

75 * @returns {object} An object containing a regular expression string and a bool	75 * @returns {object} An object containing a regular expression string, a bool

76 * indicating if the filter can be safely matched as lower	76 * indicating if the filter can be safely matched as lower

77 * case: {regexp: "...", canSafelyMatchAsLowercase: true/false }	77 * case, a hostname string (or undefined) and a bool

	78 * indicating if the source only contains a hostname or not:

	79 * {regexp: "...",

	80 * canSafelyMatchAsLowercase: true/false,

	81 * hostname: "...",

	82 * justHostname: true/false}

78 */	83 */

79 function toRegExp(text)	84 function parseFilterRegexpSource(text)

80 {	85 {

81 let result = [];	86 let regexp = [];

82 let lastIndex = text.length - 1;	87 let lastIndex = text.length - 1;

	88 let hostname;

83 let hostnameStart = null;	89 let hostnameStart = null;

84 let hostnameFinished = false;	90 let hostnameFinished = false;

	91 let justHostname = false;

85 let canSafelyMatchAsLowercase = false;	92 let canSafelyMatchAsLowercase = false;

86	93

87 for (let i = 0; i < text.length; i++)	94 for (let i = 0; i < text.length; i++)

88 {	95 {

89 let c = text[i];	96 let c = text[i];

90	97

	98 if (hostnameFinished)

	99 justHostname = false;

	100

91 // If we're currently inside the hostname we have to be careful not to	101 // If we're currently inside the hostname we have to be careful not to

92 // escape any characters until after we have converted it to punycode.	102 // escape any characters until after we have converted it to punycode.

93 if (hostnameStart != null && !hostnameFinished)	103 if (hostnameStart != null && !hostnameFinished)

94 {	104 {

95 let endingChar = (c == "*" \|\| c == "^" \|\|	105 let endingChar = (c == "*" \|\| c == "^" \|\|

96 c == "?" \|\| c == "/" \|\| c == "\|");	106 c == "?" \|\| c == "/" \|\| c == "\|");

97 if (!endingChar && i != lastIndex)	107 if (!endingChar && i != lastIndex)

98 continue;	108 continue;

99	109

100 let hostname = text.substring(hostnameStart, endingChar ? i : i + 1);	110 hostname = punycode.toASCII(

101 hostnameFinished = true;	111 text.substring(hostnameStart, endingChar ? i : i + 1)

102 result.push(escapeRegExp(punycode.toASCII(hostname)));	112 );

	113 hostnameFinished = justHostname = true;

	114 regexp.push(escapeRegExp(hostname));

103 if (!endingChar)	115 if (!endingChar)

104 break;	116 break;

105 }	117 }

106	118

107 switch (c)	119 switch (c)

108 {	120 {

109 case "*":	121 case "*":

110 if (result.length > 0 && i < lastIndex && text[i + 1] != "*")	122 if (regexp.length > 0 && i < lastIndex && text[i + 1] != "*")

111 result.push(".*");	123 regexp.push(".*");

112 break;	124 break;

113 case "^":	125 case "^":

114 if (i < lastIndex)	126 if (i < lastIndex)

115 result.push(".");	127 regexp.push(".");

116 break;	128 break;

117 case "\|":	129 case "\|":

118 if (i == 0)	130 if (i == 0)

119 {	131 {

120 result.push("^");	132 regexp.push("^");

121 break;	133 break;

122 }	134 }

123 if (i == lastIndex)	135 if (i == lastIndex)

124 {	136 {

125 result.push("$");	137 regexp.push("$");

126 break;	138 break;

127 }	139 }

128 if (i == 1 && text[0] == "\|")	140 if (i == 1 && text[0] == "\|")

129 {	141 {

130 hostnameStart = i + 1;	142 hostnameStart = i + 1;

131 canSafelyMatchAsLowercase = true;	143 canSafelyMatchAsLowercase = true;

132 result.push("https?://");	144 regexp.push("https?://");

133 break;	145 break;

134 }	146 }

135 result.push("\\\|");	147 regexp.push("\\\|");

136 break;	148 break;

137 case "/":	149 case "/":

138 if (!hostnameFinished &&	150 if (!hostnameFinished &&

139 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")	151 text.charAt(i-2) == ":" && text.charAt(i-1) == "/")

140 {	152 {

141 hostnameStart = i + 1;	153 hostnameStart = i + 1;

142 canSafelyMatchAsLowercase = true;	154 canSafelyMatchAsLowercase = true;

143 }	155 }

144 result.push("/");	156 regexp.push("/");

145 break;	157 break;

146 case ".": case "+": case "$": case "?":	158 case ".": case "+": case "$": case "?":

147 case "{": case "}": case "(": case ")":	159 case "{": case "}": case "(": case ")":

148 case "[": case "]": case "\\":	160 case "[": case "]": case "\\":

149 result.push("\\", c);	161 regexp.push("\\", c);

150 break;	162 break;

151 default:	163 default:

152 if (hostnameFinished && (c >= "a" && c <= "z" \|\|	164 if (hostnameFinished && (c >= "a" && c <= "z" \|\|

153 c >= "A" && c <= "Z"))	165 c >= "A" && c <= "Z"))

154 canSafelyMatchAsLowercase = false;	166 canSafelyMatchAsLowercase = false;

155 result.push(c);	167 regexp.push(c);

156 }	168 }

157 }	169 }

158	170

159 return {regexp: result.join(""),	171 return {

160 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase};	172 regexp: regexp.join(""),

161 }	173 canSafelyMatchAsLowercase: canSafelyMatchAsLowercase,

162	174 hostname: hostname,

163 function getRegExpTrigger(filter)	175 justHostname: justHostname

164 {	176 };

165 let result = toRegExp(filter.regexpSource);

166

167 let trigger = {"url-filter": result.regexp};

168

169 // Limit rules to to HTTP(S) URLs

170 if (!/^(\^\|http)/i.test(trigger["url-filter"]))

171 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];

172

173 // For rules containing only a hostname we know that we're matching against

174 // a lowercase string unless the matchCase option was passed.

175 if (result.canSafelyMatchAsLowercase && !filter.matchCase)

176 trigger["url-filter"] = trigger["url-filter"].toLowerCase();

177

178 if (result.canSafelyMatchAsLowercase \|\| filter.matchCase)

179 trigger["url-filter-is-case-sensitive"] = true;

180

181 return trigger;

182 }	177 }

183	178

184 function getResourceTypes(filter)	179 function getResourceTypes(filter)

185 {	180 {

186 let types = [];	181 let types = [];

187	182

188 if (filter.contentType & typeMap.IMAGE)	183 if (filter.contentType & typeMap.IMAGE)

189 types.push("image");	184 types.push("image");

190 if (filter.contentType & typeMap.STYLESHEET)	185 if (filter.contentType & typeMap.STYLESHEET)

191 types.push("style-sheet");	186 types.push("style-sheet");

(...skipping 24 matching lines...) Expand all Loading...
216 {	211 {

217 result.push(domain);	212 result.push(domain);

218	213

219 if (tldjs.getDomain(domain) == domain)	214 if (tldjs.getDomain(domain) == domain)

220 result.push("www." + domain);	215 result.push("www." + domain);

221 }	216 }

222	217

223 return result;	218 return result;

224 }	219 }

225	220

226 function convertFilter(filter, action, withResourceTypes)	221 function convertFilterAddRules(rules, filter, action, withResourceTypes)

227 {	222 {

228 let trigger = getRegExpTrigger(filter);	223 let parsed = parseFilterRegexpSource(filter.regexpSource);

	224

	225 // For the special case of $document whitelisting filters with just a domain

	226 // we can generate an equivalent blocking rule exception using if-domain.

	227 if (filter instanceof filterClasses.WhitelistFilter &&

	228 filter.contentType & typeMap.DOCUMENT &&

	229 parsed.justHostname)

	230 {

	231 rules.push({

	232 trigger: {

	233 "url-filter": ".*",

	234 "if-domain": addDomainPrefix([parsed.hostname])

	235 },

	236 action: {type: "ignore-previous-rules"}

	237 });

	238 // If the filter contains multiple options we'll need to generate further

	239 // rules for it, but if not we can simply return now.

	240 if (filter.contentType == typeMap.DOCUMENT)
	Sebastian Noack 2016/05/17 10:55:24 What if the filter is @@\|\|example.com$document,elemhide? Then we wouldn't bail out here, though we don't have to create another rule either. I think we have to check for IMAGE, SUBDOCUMENT, SCRIPT, etc. explicitly below. Perhaps move the respective mask to a const to avoid duplication. kzar 2016/05/17 11:23:34 On 2016/05/17 10:55:24, Sebastian Noack wrote: > What if the filter is @@\|\|example.com$document,elemhide? > > Then we wouldn't bail out here, though we don't have to create another rule > either. I think we have to check for IMAGE, SUBDOCUMENT, SCRIPT, etc. explicitly > below. Perhaps move the respective mask to a const to avoid duplication. Done.
	241 return rules;

	242 }

	243

	244 let trigger = {"url-filter": parsed.regexp};

	245

	246 // Limit rules to HTTP(S) URLs

	247 if (!/^(\^\|http)/i.test(trigger["url-filter"]))

	248 trigger["url-filter"] = "^https?://.*" + trigger["url-filter"];

	249

	250 // For rules containing only a hostname we know that we're matching against

	251 // a lowercase string unless the matchCase option was passed.

	252 if (parsed.canSafelyMatchAsLowercase && !filter.matchCase)

	253 trigger["url-filter"] = trigger["url-filter"].toLowerCase();

	254

	255 if (parsed.canSafelyMatchAsLowercase \|\| filter.matchCase)

	256 trigger["url-filter-is-case-sensitive"] = true;

	257

229 let included = [];	258 let included = [];

230 let excluded = [];	259 let excluded = [];

231	260

232 parseDomains(filter.domains, included, excluded);	261 parseDomains(filter.domains, included, excluded);

233	262

234 if (withResourceTypes)	263 if (withResourceTypes)

235 trigger["resource-type"] = getResourceTypes(filter);	264 trigger["resource-type"] = getResourceTypes(filter);

236 if (filter.thirdParty != null)	265 if (filter.thirdParty != null)

237 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];	266 trigger["load-type"] = [filter.thirdParty ? "third-party" : "first-party"];

238	267

239 if (included.length > 0)	268 if (included.length > 0)

240 trigger["if-domain"] = addDomainPrefix(included);	269 trigger["if-domain"] = addDomainPrefix(included);

241 else if (excluded.length > 0)	270 else if (excluded.length > 0)

242 trigger["unless-domain"] = addDomainPrefix(excluded);	271 trigger["unless-domain"] = addDomainPrefix(excluded);

243	272

244 return {trigger: trigger, action: {type: action}};	273 rules.push({trigger: trigger, action: {type: action}});

	274

	275 return rules;
	Sebastian Noack 2016/05/17 10:55:24 You don't have to return the rules here anymore. kzar 2016/05/17 11:23:34 On 2016/05/17 10:55:24, Sebastian Noack wrote: > You don't have to return the rules here anymore. Oops, Done.
245 }	276 }

246	277

247 function hasNonASCI(obj)	278 function hasNonASCI(obj)

248 {	279 {

249 if (typeof obj == "string")	280 if (typeof obj == "string")

250 {	281 {

251 if (/[^\x00-\x7F]/.test(obj))	282 if (/[^\x00-\x7F]/.test(obj))

252 return true;	283 return true;

253 }	284 }

254	285

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
345 return;	376 return;

346 if (filter instanceof filterClasses.RegExpFilter &&	377 if (filter instanceof filterClasses.RegExpFilter &&

347 filter.regexpSource == null)	378 filter.regexpSource == null)

348 return;	379 return;

349	380

350 if (filter instanceof filterClasses.BlockingFilter)	381 if (filter instanceof filterClasses.BlockingFilter)

351 this.requestFilters.push(filter);	382 this.requestFilters.push(filter);

352	383

353 if (filter instanceof filterClasses.WhitelistFilter)	384 if (filter instanceof filterClasses.WhitelistFilter)

354 {	385 {

355 if (filter.contentType & (typeMap.IMAGE	386 if (filter.contentType & (typeMap.DOCUMENT

	387 \| typeMap.IMAGE

356 \| typeMap.STYLESHEET	388 \| typeMap.STYLESHEET

357 \| typeMap.SCRIPT	389 \| typeMap.SCRIPT

358 \| typeMap.FONT	390 \| typeMap.FONT

359 \| typeMap.MEDIA	391 \| typeMap.MEDIA

360 \| typeMap.POPUP	392 \| typeMap.POPUP

361 \| typeMap.OBJECT	393 \| typeMap.OBJECT

362 \| typeMap.OBJECT_SUBREQUEST	394 \| typeMap.OBJECT_SUBREQUEST

363 \| typeMap.XMLHTTPREQUEST	395 \| typeMap.XMLHTTPREQUEST

364 \| typeMap.PING	396 \| typeMap.PING

365 \| typeMap.SUBDOCUMENT	397 \| typeMap.SUBDOCUMENT

(...skipping 19 matching lines...) Expand all Loading...
385	417

386 /**	418 /**

387 * Generate content blocker list for all filters that were added	419 * Generate content blocker list for all filters that were added

388 *	420 *

389 * @returns {Filter} filter Filter to convert	421 * @returns {Filter} filter Filter to convert

390 */	422 */

391 ContentBlockerList.prototype.generateRules = function(filter)	423 ContentBlockerList.prototype.generateRules = function(filter)

392 {	424 {

393 let rules = [];	425 let rules = [];

394	426

395 function addRule(rule)

396 {

397 if (!hasNonASCI(rule))

398 rules.push(rule);

399 }

400

401 let groupedElemhideFilters = new Map();	427 let groupedElemhideFilters = new Map();

402 for (let filter of this.elemhideFilters)	428 for (let filter of this.elemhideFilters)

403 {	429 {

404 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);	430 let result = convertElemHideFilter(filter, this.elemhideSelectorExceptions);

405 if (!result)	431 if (!result)

406 continue;	432 continue;

407	433

408 if (result.matchDomains.length == 0)	434 if (result.matchDomains.length == 0)

409 result.matchDomains = ["^https?://"];	435 result.matchDomains = ["^https?://"];

410	436

411 for (let matchDomain of result.matchDomains)	437 for (let matchDomain of result.matchDomains)

412 {	438 {

413 let group = groupedElemhideFilters.get(matchDomain) \|\| [];	439 let group = groupedElemhideFilters.get(matchDomain) \|\| [];

414 group.push(result.selector);	440 group.push(result.selector);

415 groupedElemhideFilters.set(matchDomain, group);	441 groupedElemhideFilters.set(matchDomain, group);

416 }	442 }

417 }	443 }

418	444

419 groupedElemhideFilters.forEach((selectors, matchDomain) =>	445 groupedElemhideFilters.forEach((selectors, matchDomain) =>

420 {	446 {

421 while (selectors.length)	447 while (selectors.length)

422 {	448 {

423 let selector = selectors.splice(0, selectorLimit).join(", ");	449 let selector = selectors.splice(0, selectorLimit).join(", ");

424	450

425 // As of Safari 9.0 element IDs are matched as lowercase. We work around	451 // As of Safari 9.0 element IDs are matched as lowercase. We work around

426 // this by converting to the attribute format [id="elementID"]	452 // this by converting to the attribute format [id="elementID"]

427 selector = convertIDSelectorsToAttributeSelectors(selector);	453 selector = convertIDSelectorsToAttributeSelectors(selector);

428	454

429 addRule({	455 rules.push({

430 trigger: {"url-filter": matchDomain,	456 trigger: {"url-filter": matchDomain,

431 "url-filter-is-case-sensitive": true},	457 "url-filter-is-case-sensitive": true},

432 action: {type: "css-display-none",	458 action: {type: "css-display-none",

433 selector: selector}	459 selector: selector}

434 });	460 });

435 }	461 }

436 });	462 });

437	463

438 for (let filter of this.elemhideExceptions)	464 for (let filter of this.elemhideExceptions)

439 addRule(convertFilter(filter, "ignore-previous-rules", false));	465 convertFilterAddRules(rules, filter, "ignore-previous-rules", false);

440 for (let filter of this.requestFilters)	466 for (let filter of this.requestFilters)

441 addRule(convertFilter(filter, "block", true));	467 convertFilterAddRules(rules, filter, "block", true);

442 for (let filter of this.requestExceptions)	468 for (let filter of this.requestExceptions)

443 addRule(convertFilter(filter, "ignore-previous-rules", true));	469 convertFilterAddRules(rules, filter, "ignore-previous-rules", true);

444	470

445 return rules;	471 return rules.filter(rule => !hasNonASCI(rule));

446 };	472 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »