Index: cms/converters.py |
=================================================================== |
--- a/cms/converters.py |
+++ b/cms/converters.py |
@@ -14,31 +14,33 @@ |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
from __future__ import unicode_literals |
import os |
import HTMLParser |
import re |
import urlparse |
+import json |
import jinja2 |
import markdown |
# Monkey-patch Markdown's isBlockLevel function to ensure that no paragraphs |
# are inserted into the <head> tag |
orig_isBlockLevel = markdown.util.isBlockLevel |
def isBlockLevel(tag):
    # Force <head> to be treated as block-level so Markdown does not
    # wrap its contents in <p> tags; every other tag keeps the
    # behavior of the original (monkey-patched) check.
    return True if tag == 'head' else orig_isBlockLevel(tag)
+ |
Sebastian Noack
2017/08/29 22:49:39
Adding this blank line is unrelated.
rosie
2018/03/26 02:32:21
True. My linter was showing a warning because ther
Sebastian Noack
2018/03/26 02:57:47
Strictly, there should be two blank lines surround
|
markdown.util.isBlockLevel = isBlockLevel |
html_escapes = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    "'": '&#39;',
@@ -112,33 +114,54 @@ |
def handle_entityref(self, name):
    # Named entity reference (e.g. "amp"): rebuild the "&name;" form,
    # decode it with HTMLParser.unescape and append as plain text.
    self._append_text(self.unescape('&{};'.format(name)))

def handle_charref(self, name):
    # Numeric character reference (e.g. "64" or "x40"): rebuild the
    # "&#name;" form, decode it and append as plain text.
    self._append_text(self.unescape('&#{};'.format(name)))
def parse_metadata(page, data):
    """Extract metadata from the source text of a page.

    Metadata may be given either as a leading JSON object or in the
    legacy "name = value" line format (a value wrapped in [brackets]
    is split on commas into a list of strings).

    Parameters
    ----------
    page : str
        Name of the page; always stored under the 'page' key.
    data : str
        Raw page source, optionally starting with metadata.

    Returns
    -------
    (dict, str)
        The metadata dict and the remaining page source.
    """
    metadata = {'page': page}
    try:
        decoder = json.JSONDecoder()
        json_data, index = decoder.raw_decode(data)
        # Only a JSON *object* is valid metadata. Anything else that
        # happens to parse as JSON (a list, number or string) must not
        # be fed to dict.update(); raise so the legacy-format fallback
        # below handles the data instead.
        if not isinstance(json_data, dict):
            raise ValueError('metadata must be a JSON object')
        metadata.update(json_data)
        page_data = data[index:].strip()
    except ValueError:
        # Legacy format: consume leading "name = value" lines, leaving
        # everything from the first non-matching line onward as page
        # content.
        lines = data.splitlines(True)
        for i, line in enumerate(lines):
            if not re.search(r'^\s*[\w\-]+\s*=', line):
                break
            name, value = line.split('=', 1)
            value = value.strip()
            if value.startswith('[') and value.endswith(']'):
                value = [element.strip() for element in value[1:-1].split(',')]
            lines[i] = ''
            metadata[name.strip()] = value
        page_data = ''.join(lines)
    return metadata, page_data
+ |
+ |
def parse_page_content(page, data):
    """Separate page content into metadata (dict) and body text (str).

    If the page starts with an HTML comment ("<!-- ... -->"), the
    metadata is read from inside the comment and the comment (with the
    metadata lines removed) is preserved in the returned body so that
    source line positions stay meaningful. Otherwise metadata is read
    directly from the start of the page.

    Parameters
    ----------
    page : str
        Name of the page, passed through to parse_metadata().
    data : str
        Raw page source.

    Returns
    -------
    (dict, str)
        The metadata dict and the body text.
    """
    comment_start = '<!--'
    comment_end = '-->'
    if data.lstrip().startswith(comment_start):
        end_index = data.find(comment_end)
        # Only treat the comment as a metadata block if it is properly
        # terminated; an unterminated "<!--" previously crashed with an
        # uncaught ValueError from str.index().
        if end_index >= 0:
            start_index = data.index(comment_start) + len(comment_start)
            comment = data[start_index:end_index]
            page_tail = data[end_index + len(comment_end):]
            # Strip the comment so leading whitespace does not break
            # JSON parsing in parse_metadata().
            metadata, comment_data = parse_metadata(page, comment.strip())
            page_data = '{}\n{}\n{}\n\n{}'.format(comment_start, comment_data,
                                                  comment_end,
                                                  page_tail.strip())
            return metadata, page_data
    metadata, page_data = parse_metadata(page, data.strip())
    return metadata, page_data
class Converter: |
whitelist = {'a', 'em', 'sup', 'strong', 'code', 'span'} |
missing_translations = 0 |
total_translations = 0 |
def __init__(self, params, key='pagedata'): |