cms/utils.py - Issue 29516687: Issue 4488 - Add support for JSON page front matter

Side by Side Diff: cms/utils.py

Issue 29516687: Issue 4488 - Add support for JSON page front matter (Closed) Base URL: https://hg.adblockplus.org/cms

Patch Set: Preserve lines in metadata, fix sort in sitemap.tmpl Created March 26, 2018, 2:28 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # This file is part of the Adblock Plus web scripts,	1 # This file is part of the Adblock Plus web scripts,

2 # Copyright (C) 2006-present eyeo GmbH	2 # Copyright (C) 2006-present eyeo GmbH

3 #	3 #

4 # Adblock Plus is free software: you can redistribute it and/or modify	4 # Adblock Plus is free software: you can redistribute it and/or modify

5 # it under the terms of the GNU General Public License version 3 as	5 # it under the terms of the GNU General Public License version 3 as

6 # published by the Free Software Foundation.	6 # published by the Free Software Foundation.

7 #	7 #

8 # Adblock Plus is distributed in the hope that it will be useful,	8 # Adblock Plus is distributed in the hope that it will be useful,

9 # but WITHOUT ANY WARRANTY; without even the implied warranty of	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of

10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11 # GNU General Public License for more details.	11 # GNU General Public License for more details.

12 #	12 #

13 # You should have received a copy of the GNU General Public License	13 # You should have received a copy of the GNU General Public License

14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

15	15

16 import re	16 import re

	17 import json

17	18

18 __all__ = [	19 __all__ = [

19 'get_page_params',	20 'get_page_params',

20 'process_page',	21 'process_page',

21 'split_head_body',	22 'split_head_body',

22 'extract_page_metadata'	23 'extract_page_metadata'

23 ]	24 ]

24	25

25	26

26 def split_head_body(html):	27 def split_head_body(html):

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
58 ----------	59 ----------

59 source: str	60 source: str

60 Source text of the page.	61 Source text of the page.

61	62

62 Returns	63 Returns

63 -------	64 -------

64 (dict, str)	65 (dict, str)

65 Metadata of the page, remaining source text without metadata.	66 Metadata of the page, remaining source text without metadata.

66	67

67 """	68 """

68 metadata = {}	69 m = re.search(r'^\s<!--\s(.*?)-->', source, re.S)

69 lines = source.splitlines(True)	70 text = m.group(1) if m else source

70 for i, line in enumerate(lines):	71

71 if line.strip() in {'<!--', '-->'}:	72 decoder = json.JSONDecoder()

72 lines[i] = ''	73 try:

73 continue	74 metadata, length = decoder.raw_decode(text)

74 if not re.search(r'^\s[\w\-]+\s=', line):	75 except ValueError:

75 break	76 metadata = None

76 name, value = line.split('=', 1)	77

77 value = value.strip()	78 if not isinstance(metadata, dict):

78 if value.startswith('[') and value.endswith(']'):	79 metadata = {}

79 value = [element.strip() for element in value[1:-1].split(',')]	80 length = 0

80 lines[i] = '\n'	81 for line in text.splitlines(True):

81 metadata[name.strip()] = value	82 if not re.search(r'^\s[\w\-]+\s=', line):

82 return metadata, ''.join(lines)	83 break

	84 name, value = line.split('=', 1)

	85 value = value.strip()

	86 if value.startswith('[') and value.endswith(']'):

	87 value = [element.strip() for element in value[1:-1].split(',')]

	88 metadata[name.strip()] = value

	89 length += len(line)

	90

	91 # Need to preserve line numbers for jinja2 tracebacks

	92 cutoff = m.end() if m else length

	93 return metadata, '\n' * source.count('\n', 0, cutoff) + source[cutoff:]
Sebastian Noack 2018/03/26 02:57:48
I just noticed one more edge case, if there is an HTML comment at the beginning of the file, but it doesn't contain any metadata, we would still strip the comment. This could be accounted for with following addition to the logic here:

    if length > 0:
        cutoff = m.end() if m else length
        source = '\n' * source.count('\n', 0, cutoff) + source[cutoff:]

    return metadata, source

Vasily Kuznetsov 2018/03/26 09:24:28
On 2018/03/26 02:57:48, Sebastian Noack wrote:
> I just noticed one more edge case, if there is an HTML comment at the beginning
> of the file, but it doesn't contain any metadata, we would still strip the
> comment. This could be accounted for with following addition to the logic here:
>
>     if length > 0:
>         cutoff = m.end() if m else length
>         source = '\n' * source.count('\n', 0, cutoff) + source[cutoff:]
>
>     return metadata, source

Good catch, Sebastian! It probably makes sense to add a test for this too, to avoid future regressions.

Jon Sonesen 2018/03/28 03:01:36
On 2018/03/26 02:57:48, Sebastian Noack wrote:
> I just noticed one more edge case, if there is an HTML comment at the beginning
> of the file, but it doesn't contain any metadata, we would still strip the
> comment. This could be accounted for with following addition to the logic here:
>
>     if length > 0:
>         cutoff = m.end() if m else length
>         source = '\n' * source.count('\n', 0, cutoff) + source[cutoff:]
>
>     return metadata, source

Perhaps a test case should be added as well to cover this?
rosie 2018/03/31 21:09:28 I've implemented Sebastian's suggestion and it wor Show quoted text On 2018/03/28 03:01:36, Jon Sonesen wrote: > On 2018/03/26 02:57:48, Sebastian Noack wrote: > > I just noticed one more edge case, if there is an HTML comment at the > beginning > > of the file, but it doesn't contain any metadata, we would still strip the > > comment. This could be accounted for with following addition to the logic > here: > > > > if length > 0: > > cutoff = m.end() if m else length > > source = '\n' * source.count('\n', 0, cutoff) + source[cutoff:] > > > > return metadata, source > > Perhaps a test case should be added as well to cover this? I've implemented Sebastian's suggestion and it works as expected, but while trying to implement a test case for this, I ran into two issues. 1) A file with no specified template uses the default.tmpl to generate the page, but the default.tmpl file is blank, so a blank page is generated. I'm not sure if this is the intended behavior for this test site. 2) The sitemap.tmpl file seems to only include pages that have some metadata when creating the sitemap output, so this new test case would never show up on the sitemap. This may be out of scope for the current issue. Sebastian Noack 2018/03/31 22:07:35 So default.tmpl is empty, and empty.tmpl is not. T Show quoted text On 2018/03/31 21:09:28, rosie wrote: > I've implemented Sebastian's suggestion and it works as expected, but while > trying to implement a test case for this, I ran into two issues. > > 1) A file with no specified template uses the default.tmpl to generate the page, > but the default.tmpl file is blank, so a blank page is generated. I'm not sure > if this is the intended behavior for this test site. So default.tmpl is empty, and empty.tmpl is not. That looks like a mistake, or at very least it's extremely confusing. But I agree that cleaning this up seems out of scope of this change. 
Show quoted text > 2) The sitemap.tmpl file seems to only include pages that have some metadata > when creating the sitemap output, so this new test case would never show up on > the sitemap. This may be out of scope for the current issue. Apparently the get_pages_metadata() function called in sitemap.tmpl explicitly ignores any pages that don't define any custom metadata: https://hg.adblockplus.org/cms/file/b683e321a9d0/cms/converters.py#l477 I cannot tell if that is intended or not, but at very least the comment there gives no sufficient explanation. Vasily Kuznetsov 2018/04/03 14:29:19 I suppose we can say that it ended up being so for Show quoted text On 2018/03/31 22:07:35, Sebastian Noack wrote: > On 2018/03/31 21:09:28, rosie wrote: > > I've implemented Sebastian's suggestion and it works as expected, but while > > trying to implement a test case for this, I ran into two issues. > > > > 1) A file with no specified template uses the default.tmpl to generate the > page, > > but the default.tmpl file is blank, so a blank page is generated. I'm not sure > > if this is the intended behavior for this test site. > > So default.tmpl is empty, and empty.tmpl is not. That looks like a mistake, or > at very least it's extremely confusing. But I agree that cleaning this up seems > out of scope of this change. I suppose we can say that it ended up being so for historical reasons :) Anyway, apart from being confusing this seems to make testing of this case unnecessarily complicated. I think the best approach would be to swap the contents of `default.html` and `empty.tmpl` and adjust the rest of the test suite to make sure the output doesn't change. https://issues.adblockplus.org/ticket/6546 (and corresponding review https://codereview.adblockplus.org/29741581) do this, so perhaps we can just land that before this change. 
Show quoted text > > 2) The sitemap.tmpl file seems to only include pages that have some metadata > > when creating the sitemap output, so this new test case would never show up on > > the sitemap. This may be out of scope for the current issue. > > Apparently the get_pages_metadata() function called in sitemap.tmpl explicitly > ignores any pages that don't define any custom metadata: > https://hg.adblockplus.org/cms/file/b683e321a9d0/cms/converters.py#l477 > I cannot tell if that is intended or not, but at very least the comment there > gives no sufficient explanation. This looks like a bug to me. I've raised it in the review (https://codereview.adblockplus.org/29472555/) but then it seems we've forgot about it. I now created https://issues.adblockplus.org/ticket/6545 to fix this, but as far as I can see this doesn't block this issue. rosie 2018/04/18 02:35:32 Acknowledged. Show quoted text On 2018/04/03 14:29:19, Vasily Kuznetsov wrote: > On 2018/03/31 22:07:35, Sebastian Noack wrote: > > On 2018/03/31 21:09:28, rosie wrote: > > > I've implemented Sebastian's suggestion and it works as expected, but while > > > trying to implement a test case for this, I ran into two issues. > > > > > > 1) A file with no specified template uses the default.tmpl to generate the > > page, > > > but the default.tmpl file is blank, so a blank page is generated. I'm not > sure > > > if this is the intended behavior for this test site. > > > > So default.tmpl is empty, and empty.tmpl is not. That looks like a mistake, or > > at very least it's extremely confusing. But I agree that cleaning this up > seems > > out of scope of this change. > > I suppose we can say that it ended up being so for historical reasons :) Anyway, > apart from being confusing this seems to make testing of this case unnecessarily > complicated. 
I think the best approach would be to swap the contents of > `default.html` and `empty.tmpl` and adjust the rest of the test suite to make > sure the output doesn't change. https://issues.adblockplus.org/ticket/6546 (and > corresponding review https://codereview.adblockplus.org/29741581) do this, so > perhaps we can just land that before this change. > > > > 2) The sitemap.tmpl file seems to only include pages that have some metadata > > > when creating the sitemap output, so this new test case would never show up > on > > > the sitemap. This may be out of scope for the current issue. > > > > Apparently the get_pages_metadata() function called in sitemap.tmpl explicitly > > ignores any pages that don't define any custom metadata: > > https://hg.adblockplus.org/cms/file/b683e321a9d0/cms/converters.py#l477 > > I cannot tell if that is intended or not, but at very least the comment there > > gives no sufficient explanation. > > This looks like a bug to me. I've raised it in the review > (https://codereview.adblockplus.org/29472555/) but then it seems we've forgot > about it. I now created https://issues.adblockplus.org/ticket/6545 to fix this, > but as far as I can see this doesn't block this issue. Acknowledged.
83	94

84	95

85 def get_page_params(source, locale, page, format=None, site_url_override=None,	96 def get_page_params(source, locale, page, format=None, site_url_override=None,

86 localized_string_callback=None):	97 localized_string_callback=None):

87 from cms.converters import converters	98 from cms.converters import converters

88	99

89 # Guess page format if omitted, but default to Markdown for friendlier exceptions	100 # Guess page format if omitted, but default to Markdown for friendlier exceptions

90 if format is None:	101 if format is None:

91 for format in converters.iterkeys():	102 for format in converters.iterkeys():

92 if source.has_page(page, format):	103 if source.has_page(page, format):

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
153 return params	164 return params

154	165

155	166

156 def process_page(source, locale, page, format=None, site_url_override=None,	167 def process_page(source, locale, page, format=None, site_url_override=None,

157 localized_string_callback=None):	168 localized_string_callback=None):

158 from cms.converters import TemplateConverter	169 from cms.converters import TemplateConverter

159	170

160 params = get_page_params(source, locale, page, format, site_url_override,	171 params = get_page_params(source, locale, page, format, site_url_override,

161 localized_string_callback)	172 localized_string_callback)

162 return TemplateConverter(*params['templatedata'], params=params)()	173 return TemplateConverter(*params['templatedata'], params=params)()

OLD	NEW

« .pytest_cache/v/cache/nodeids ('K') | « .pytest_cache/v/cache/nodeids ('k') | tests/expected_output/en/metadata_json » ('j') | tests/test_site/pages/sitemap.tmpl » ('J')