Left: | ||
Right: |
OLD | NEW |
---|---|
1 # This file is part of Adblock Plus <https://adblockplus.org/>, | 1 # This file is part of Adblock Plus <https://adblockplus.org/>, |
2 # Copyright (C) 2006-present eyeo GmbH | 2 # Copyright (C) 2006-present eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
133 | 133 |
134 | 134 |
135 Header = _line_type('Header', 'version', '[{.version}]') | 135 Header = _line_type('Header', 'version', '[{.version}]') |
136 EmptyLine = _line_type('EmptyLine', '', '') | 136 EmptyLine = _line_type('EmptyLine', '', '') |
137 Comment = _line_type('Comment', 'text', '! {.text}') | 137 Comment = _line_type('Comment', 'text', '! {.text}') |
138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') | 138 Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
139 Filter = _line_type('Filter', 'text selector action options', '{.text}') | 139 Filter = _line_type('Filter', 'text selector action options', '{.text}') |
140 Include = _line_type('Include', 'target', '%include {0.target}%') | 140 Include = _line_type('Include', 'target', '%include {0.target}%') |
141 | 141 |
142 | 142 |
143 METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') | 143 METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') |
144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') | 144 INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
145 HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) | 145 HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', |
Sebastian Noack
2018/09/15 16:08:32
I changed this regular expression like this in my
Vasily Kuznetsov
2018/09/17 10:40:27
Yeah, you're right. I think the logic of parse_lin
Vasily Kuznetsov
2018/09/18 12:41:14
Done.
| |
146 flags=re.I) | |
146 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') | 147 HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
147 FILTER_OPTIONS_REGEXP = re.compile( | 148 FILTER_OPTIONS_REGEXP = re.compile( |
148 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' | 149 r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
149 ) | 150 ) |
150 | 151 |
151 | 152 |
152 def _parse_header(text): | 153 def _parse_header(text): |
153 match = HEADER_REGEXP.match(text) | 154 match = HEADER_REGEXP.match(text) |
154 if not match: | 155 version = match.group(1) if match else None |
156 if not version: | |
155 raise ParseError('Malformed header', text) | 157 raise ParseError('Malformed header', text) |
156 return Header(match.group(1)) | 158 return Header(version) |
157 | 159 |
158 | 160 |
159 def _parse_instruction(text): | 161 def _parse_instruction(text): |
160 match = INCLUDE_REGEXP.match(text) | 162 match = INCLUDE_REGEXP.match(text) |
161 if not match: | 163 if not match: |
162 raise ParseError('Unrecognized instruction', text) | 164 raise ParseError('Unrecognized instruction', text) |
163 return Include(match.group(1)) | 165 return Include(match.group(1)) |
164 | 166 |
165 | 167 |
166 def _parse_option(option): | 168 def _parse_option(option): |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
244 Parsed filter. | 246 Parsed filter. |
245 | 247 |
246 """ | 248 """ |
247 if '#' in text: | 249 if '#' in text: |
248 match = HIDING_FILTER_REGEXP.search(text) | 250 match = HIDING_FILTER_REGEXP.search(text) |
249 if match: | 251 if match: |
250 return _parse_hiding_filter(text, *match.groups()) | 252 return _parse_hiding_filter(text, *match.groups()) |
251 return _parse_blocking_filter(text) | 253 return _parse_blocking_filter(text) |
252 | 254 |
253 | 255 |
254 def parse_line(line_text): | 256 def parse_line(line_text, mode='body'): |
Sebastian Noack
2018/09/15 16:08:32
Having the "mode" as part of the public API, requi
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to keep the mode in the public API. T
| |
255 """Parse one line of a filter list. | 257 """Parse one line of a filter list. |
256 | 258 |
257 Note that parse_line() doesn't handle special comments, hence never returns | 259 The types of lines that the parser recognizes depend on the mode. In |
258 a Metadata() object, Adblock Plus only considers metadata when parsing the | 260 body mode the parser only recognizes filters, comments, processing |
259 whole filter list and only if they are given at the top of the filter list. | 261 instructions and empty lines. In metadata mode it additionally recognizes |
262 metadata. In start mode it also recognizes headers. | |
263 | |
264 Note: checksum metadata lines are recognized in all modes for backwards | |
265 compatibility. Historically, checksums can occur at the bottom of the | |
266 filter list. They are no longer used by Adblock Plus, but in order to | |
267 strip them (in abp.filters.renderer), we have to make sure to still parse | |
268 them regardless of their position in the filter list. | |
260 | 269 |
261 Parameters | 270 Parameters |
262 ---------- | 271 ---------- |
263 line_text : str | 272 line_text : str |
264 Line of a filter list. | 273 Line of a filter list. |
274 mode : str | |
275 Parsing mode, one of "start", "metadata" or "body" (default). | |
265 | 276 |
266 Returns | 277 Returns |
267 ------- | 278 ------- |
268 namedtuple | 279 namedtuple |
269 Parsed line (see `_line_type`). | 280 Parsed line (see `_line_type`). |
270 | 281 |
271 Raises | 282 Raises |
272 ------ | 283 ------ |
273 ParseError | 284 ParseError |
274 ParseError: If the line can't be parsed. | 285 ParseError: If the line can't be parsed. |
286 | |
275 """ | 287 """ |
288 MODES = {'body', 'start', 'metadata'} | |
289 if mode not in MODES: | |
290 raise ValueError('mode should be one of {}'.format(MODES)) | |
291 | |
276 if isinstance(line_text, type(b'')): | 292 if isinstance(line_text, type(b'')): |
277 line_text = line_text.decode('utf-8') | 293 line_text = line_text.decode('utf-8') |
278 | 294 |
279 content = line_text.strip() | 295 content = line_text.strip() |
Sebastian Noack
2018/09/15 16:08:32
Adblock Plus doesn't strip the line before process
Vasily Kuznetsov
2018/09/17 10:40:27
The behavior of ABP for the headers seems right. I
Sebastian Noack
2018/09/17 18:11:52
Adblock Plus extracts metadata (and the header) be
Vasily Kuznetsov
2018/09/18 12:41:14
Acknowledged.
| |
280 | 296 |
281 if content == '': | 297 if content == '': |
282 line = EmptyLine() | 298 return EmptyLine() |
283 elif content.startswith('!'): | |
284 line = Comment(content[1:].lstrip()) | |
285 elif content.startswith('%') and content.endswith('%'): | |
286 line = _parse_instruction(content) | |
287 elif content.startswith('[') and content.endswith(']'): | |
288 line = _parse_header(content) | |
289 else: | |
290 line = parse_filter(content) | |
291 | 299 |
292 assert line.to_string().replace(' ', '') == content.replace(' ', '') | 300 if content.startswith('!'): |
293 return line | 301 match = METADATA_REGEXP.match(line_text) |
302 if match: | |
303 key, value = match.groups() | |
304 if mode != 'body' or key.lower() == 'checksum': | |
Sebastian Noack
2018/09/15 16:08:32
We probably should keep the comment why we treat c
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to also keep the note about checksums
Sebastian Noack
2018/09/17 18:11:52
I didn't notice that you moved that note to the do
Vasily Kuznetsov
2018/09/18 12:41:14
It needs to be in the docstring because it's part
| |
305 return Metadata(key, value) | |
306 return Comment(content[1:].lstrip()) | |
307 | |
308 if content.startswith('%') and content.endswith('%'): | |
309 return _parse_instruction(content) | |
310 | |
311 if mode == 'start' and content.startswith('[') and content.endswith(']'): | |
312 return _parse_header(content) | |
313 | |
314 return parse_filter(content) | |
294 | 315 |
295 | 316 |
296 def parse_filterlist(lines): | 317 def parse_filterlist(lines): |
297 """Parse filter list from an iterable. | 318 """Parse filter list from an iterable. |
298 | 319 |
299 Parameters | 320 Parameters |
300 ---------- | 321 ---------- |
301 lines: iterable of str | 322 lines: iterable of str |
302 Lines of the filter list. | 323 Lines of the filter list. |
303 | 324 |
304 Returns | 325 Returns |
305 ------- | 326 ------- |
306 iterator of namedtuple | 327 iterator of namedtuple |
307 Parsed lines of the filter list. | 328 Parsed lines of the filter list. |
308 | 329 |
309 Raises | 330 Raises |
310 ------ | 331 ------ |
311 ParseError | 332 ParseError |
312 Thrown during iteration for invalid filter list lines. | 333 Thrown during iteration for invalid filter list lines. |
313 TypeError | 334 TypeError |
314 If `lines` is not iterable. | 335 If `lines` is not iterable. |
315 | 336 |
316 """ | 337 """ |
317 metadata_closed = False | 338 mode = 'start' |
Sebastian Noack
2018/09/17 18:11:52
Maybe "position" would be more accurate name for t
Vasily Kuznetsov
2018/09/18 12:41:14
Yeah, "position" is a better name. I changed it.
| |
318 | 339 |
319 for line in lines: | 340 for line in lines: |
320 result = parse_line(line) | 341 parsed_line = parse_line(line, mode) |
342 yield parsed_line | |
321 | 343 |
322 if result.type == 'comment': | 344 if mode != 'body' and parsed_line.type in {'header', 'metadata'}: |
323 match = METADATA_REGEXP.match(result.text) | 345 # Continue parsing metadata if it's not over... |
324 if match: | 346 mode = 'metadata' |
325 key, value = match.groups() | 347 else: |
326 | 348 # ...otherwise switch to parsing filter list body. |
327 # Historically, checksums can occur at the bottom of the | 349 mode = 'body' |
328 # filter list. Checksums are no longer used by Adblock Plus, | |
329 # but in order to strip them (in abp.filters.renderer), | |
330 # we have to make sure to still parse them regardless of | |
331 # their position in the filter list. | |
332 if not metadata_closed or key.lower() == 'checksum': | |
333 result = Metadata(key, value) | |
334 | |
335 if result.type not in {'header', 'metadata'}: | |
336 metadata_closed = True | |
337 | |
338 yield result | |
OLD | NEW |