Index: abp/filters/parser.py |
=================================================================== |
--- a/abp/filters/parser.py |
+++ b/abp/filters/parser.py |
@@ -135,30 +135,32 @@ |
Header = _line_type('Header', 'version', '[{.version}]') |
EmptyLine = _line_type('EmptyLine', '', '') |
Comment = _line_type('Comment', 'text', '! {.text}') |
Metadata = _line_type('Metadata', 'key value', '! {0.key}: {0.value}') |
Filter = _line_type('Filter', 'text selector action options', '{.text}') |
Include = _line_type('Include', 'target', '%include {0.target}%') |
-METADATA_REGEXP = re.compile(r'(.*?)\s*:\s*(.*)') |
+METADATA_REGEXP = re.compile(r'\s*!\s*(.*?)\s*:\s*(.*)') |
INCLUDE_REGEXP = re.compile(r'%include\s+(.+)%') |
-HEADER_REGEXP = re.compile(r'\[(Adblock(?:\s*Plus\s*[\d\.]+?)?)\]', flags=re.I) |
+HEADER_REGEXP = re.compile(r'\[(?:(Adblock(?:\s*Plus\s*[\d\.]+?)?)|.*)\]$', |
Sebastian Noack
2018/09/15 16:08:32
I changed this regular expression like this in my
Vasily Kuznetsov
2018/09/17 10:40:27
Yeah, you're right. I think the logic of parse_lin
Vasily Kuznetsov
2018/09/18 12:41:14
Done.
|
+ flags=re.I) |
HIDING_FILTER_REGEXP = re.compile(r'^([^/*|@"!]*?)#([@?])?#(.+)$') |
FILTER_OPTIONS_REGEXP = re.compile( |
r'\$(~?[\w-]+(?:=[^,]+)?(?:,~?[\w-]+(?:=[^,]+)?)*)$' |
) |
def _parse_header(text): |
match = HEADER_REGEXP.match(text) |
- if not match: |
+ version = match.group(1) if match else None |
+ if not version: |
raise ParseError('Malformed header', text) |
- return Header(match.group(1)) |
+ return Header(version) |
def _parse_instruction(text): |
match = INCLUDE_REGEXP.match(text) |
if not match: |
raise ParseError('Unrecognized instruction', text) |
return Include(match.group(1)) |
@@ -246,56 +248,75 @@ |
""" |
if '#' in text: |
match = HIDING_FILTER_REGEXP.search(text) |
if match: |
return _parse_hiding_filter(text, *match.groups()) |
return _parse_blocking_filter(text) |
-def parse_line(line_text): |
+def parse_line(line_text, mode='body'): |
Sebastian Noack
2018/09/15 16:08:32
Having the "mode" as part of the public API, requi
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to keep the mode in the public API. T
|
"""Parse one line of a filter list. |
- Note that parse_line() doesn't handle special comments, hence never returns |
- a Metadata() object, Adblock Plus only considers metadata when parsing the |
- whole filter list and only if they are given at the top of the filter list. |
+ The types of lines that the parser recognizes depend on the mode. In |
+ body mode the parser only recognizes filters, comments, processing |
+ instructions and empty lines. In metadata mode it additionally recognizes |
+ metadata. In start mode it also recognizes headers. |
+ |
+ Note: checksum metadata lines are recognized in all modes for backwards |
+ compatibility. Historically, checksums can occur at the bottom of the |
+ filter list. They are no longer used by Adblock Plus, but in order to |
+ strip them (in abp.filters.renderer), we have to make sure to still parse |
+ them regardless of their position in the filter list. |
Parameters |
---------- |
line_text : str |
Line of a filter list. |
+ mode : str |
+ Parsing mode, one of "start", "metadata" or "body" (default). |
Returns |
------- |
namedtuple |
Parsed line (see `_line_type`). |
Raises |
------ |
ParseError |
ParseError: If the line can't be parsed. |
+ |
""" |
+ MODES = {'body', 'start', 'metadata'} |
+ if mode not in MODES: |
+ raise ValueError('mode should be one of {}'.format(MODES)) |
+ |
if isinstance(line_text, type(b'')): |
line_text = line_text.decode('utf-8') |
content = line_text.strip() |
Sebastian Noack
2018/09/15 16:08:32
Adblock Plus doesn't strip the line before process
Vasily Kuznetsov
2018/09/17 10:40:27
The behavior of ABP for the headers seems right. I
Sebastian Noack
2018/09/17 18:11:52
Adblock Plus extracts metadata (and the header) be
Vasily Kuznetsov
2018/09/18 12:41:14
Acknowledged.
|
if content == '': |
- line = EmptyLine() |
- elif content.startswith('!'): |
- line = Comment(content[1:].lstrip()) |
- elif content.startswith('%') and content.endswith('%'): |
- line = _parse_instruction(content) |
- elif content.startswith('[') and content.endswith(']'): |
- line = _parse_header(content) |
- else: |
- line = parse_filter(content) |
+ return EmptyLine() |
- assert line.to_string().replace(' ', '') == content.replace(' ', '') |
- return line |
+ if content.startswith('!'): |
+ match = METADATA_REGEXP.match(line_text) |
+ if match: |
+ key, value = match.groups() |
+ if mode != 'body' or key.lower() == 'checksum': |
Sebastian Noack
2018/09/15 16:08:32
We probably should keep the comment why we treat c
Vasily Kuznetsov
2018/09/17 10:40:27
I would like to also keep the note about checksums
Sebastian Noack
2018/09/17 18:11:52
I didn't notice that you moved that note to the do
Vasily Kuznetsov
2018/09/18 12:41:14
It needs to be in the docstring because it's part
|
+ return Metadata(key, value) |
+ return Comment(content[1:].lstrip()) |
+ |
+ if content.startswith('%') and content.endswith('%'): |
+ return _parse_instruction(content) |
+ |
+ if mode == 'start' and content.startswith('[') and content.endswith(']'): |
+ return _parse_header(content) |
+ |
+ return parse_filter(content) |
def parse_filterlist(lines): |
"""Parse filter list from an iterable. |
Parameters |
---------- |
lines: iterable of str |
@@ -309,30 +330,20 @@ |
Raises |
------ |
ParseError |
Thrown during iteration for invalid filter list lines. |
TypeError |
If `lines` is not iterable. |
""" |
- metadata_closed = False |
+ mode = 'start' |
Sebastian Noack
2018/09/17 18:11:52
Maybe "position" would be more accurate name for t
Vasily Kuznetsov
2018/09/18 12:41:14
Yeah, "position" is a better name. I changed it.
|
for line in lines: |
- result = parse_line(line) |
- |
- if result.type == 'comment': |
- match = METADATA_REGEXP.match(result.text) |
- if match: |
- key, value = match.groups() |
+ parsed_line = parse_line(line, mode) |
+ yield parsed_line |
- # Historically, checksums can occur at the bottom of the |
- # filter list. Checksums are no longer used by Adblock Plus, |
- # but in order to strip them (in abp.filters.renderer), |
- # we have to make sure to still parse them regardless of |
- # their position in the filter list. |
- if not metadata_closed or key.lower() == 'checksum': |
- result = Metadata(key, value) |
- |
- if result.type not in {'header', 'metadata'}: |
- metadata_closed = True |
- |
- yield result |
+ if mode != 'body' and parsed_line.type in {'header', 'metadata'}: |
+ # Continue parsing metadata if it's not over... |
+ mode = 'metadata' |
+ else: |
+ # ...otherwise switch to parsing filter list body. |
+ mode = 'body' |