Index: cms/converters.py |
=================================================================== |
--- a/cms/converters.py |
+++ b/cms/converters.py |
@@ -14,31 +14,33 @@ |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
from __future__ import unicode_literals |
import os |
import HTMLParser |
import re |
import urlparse |
+import json |
import jinja2 |
import markdown |
# Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
# are inserted into the <head> tag |
orig_isBlockLevel = markdown.util.isBlockLevel |
def isBlockLevel(tag):
    # Force <head> to be treated as block-level so Markdown does not
    # wrap its contents in <p> tags; every other tag keeps the
    # behavior of the original (monkey-patched) check.
    return True if tag == 'head' else orig_isBlockLevel(tag)
+ |
Sebastian Noack
2017/08/29 22:49:39
Adding this blank line is unrelated.
rosie
2018/03/26 02:32:21
True. My linter was showing a warning because ther
Sebastian Noack
2018/03/26 02:57:47
Strictly, there should be two blank lines surround
|
markdown.util.isBlockLevel = isBlockLevel |
html_escapes = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    "'": '&#39;',
@@ -112,33 +114,54 @@ |
def handle_entityref(self, name):
    # Named entity reference (e.g. "amp"): rebuild the "&name;" form,
    # decode it with HTMLParser.unescape and append as plain text.
    self._append_text(self.unescape('&{};'.format(name)))

def handle_charref(self, name):
    # Numeric character reference (e.g. "64" or "x40"): rebuild the
    # "&#name;" form, decode it and append as plain text.
    self._append_text(self.unescape('&#{};'.format(name)))
def parse_metadata(page, data):
    """Extract metadata from the source text of a page.

    Metadata may be given either as a leading JSON object or in the
    legacy "name = value" line format (a value wrapped in [brackets]
    is split on commas into a list of strings).

    Parameters
    ----------
    page : str
        Name of the page; always stored under the 'page' key.
    data : str
        Raw page source, optionally starting with metadata.

    Returns
    -------
    (dict, str)
        The metadata dict and the remaining page source.
    """
    metadata = {'page': page}
    try:
        decoder = json.JSONDecoder()
        json_data, index = decoder.raw_decode(data)
        # Only a JSON *object* is valid metadata. Anything else that
        # happens to parse as JSON (a list, number or string) must not
        # be fed to dict.update(); raise so the legacy-format fallback
        # below handles the data instead.
        if not isinstance(json_data, dict):
            raise ValueError('metadata must be a JSON object')
        metadata.update(json_data)
        page_data = data[index:].strip()
    except ValueError:
        # Legacy format: consume leading "name = value" lines, leaving
        # everything from the first non-matching line onward as page
        # content.
        lines = data.splitlines(True)
        for i, line in enumerate(lines):
            if not re.search(r'^\s*[\w\-]+\s*=', line):
                break
            name, value = line.split('=', 1)
            value = value.strip()
            if value.startswith('[') and value.endswith(']'):
                value = [element.strip() for element in value[1:-1].split(',')]
            lines[i] = ''
            metadata[name.strip()] = value
        page_data = ''.join(lines)
    return metadata, page_data
+ |
+ |
def parse_page_content(page, data):
    """Separate page content into metadata (dict) and body text (str).

    If the page starts with an HTML comment ("<!-- ... -->"), the
    metadata is read from inside the comment and the comment (with the
    metadata lines removed) is preserved in the returned body so that
    source line positions stay meaningful. Otherwise metadata is read
    directly from the start of the page.

    Parameters
    ----------
    page : str
        Name of the page, passed through to parse_metadata().
    data : str
        Raw page source.

    Returns
    -------
    (dict, str)
        The metadata dict and the body text.
    """
    comment_start = '<!--'
    comment_end = '-->'
    if data.lstrip().startswith(comment_start):
        end_index = data.find(comment_end)
        # Only treat the comment as a metadata block if it is properly
        # terminated; an unterminated "<!--" previously crashed with an
        # uncaught ValueError from str.index().
        if end_index >= 0:
            start_index = data.index(comment_start) + len(comment_start)
            comment = data[start_index:end_index]
            page_tail = data[end_index + len(comment_end):]
            # Strip the comment so leading whitespace does not break
            # JSON parsing in parse_metadata().
            metadata, comment_data = parse_metadata(page, comment.strip())
            page_data = '{}\n{}\n{}\n\n{}'.format(comment_start, comment_data,
                                                  comment_end,
                                                  page_tail.strip())
            return metadata, page_data
    metadata, page_data = parse_metadata(page, data.strip())
    return metadata, page_data
class Converter: |
whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
missing_translations = 0 |
total_translations = 0 |
def __init__(self, params, key='pagedata'): |