Index: compiled/RegExpFilter.cpp |
=================================================================== |
new file mode 100644 |
--- /dev/null |
+++ b/compiled/RegExpFilter.cpp |
@@ -0,0 +1,307 @@ |
+#include <climits> |
+ |
+#include <emscripten.h> |
+ |
+#include "RegExpFilter.h" |
+#include "WhitelistFilter.h" |
+#include "InvalidFilter.h" |
+#include "StringScanner.h" |
+#include "StringMap.h" |
+ |
+// File-local constants and helpers used by RegExpFilter. |
+namespace |
+{ |
+  // Content type bit flags. A filter's content type mask is a bitwise OR of |
+  // these values and is tested against the request's type mask when matching. |
+  enum |
+  { |
+    TYPE_OTHER = 0x1, |
+    TYPE_SCRIPT = 0x2, |
+    TYPE_IMAGE = 0x4, |
+    TYPE_STYLESHEET = 0x8, |
+    TYPE_OBJECT = 0x10, |
+    TYPE_SUBDOCUMENT = 0x20, |
+    TYPE_DOCUMENT = 0x40, |
+    TYPE_PING = 0x400, |
+    TYPE_XMLHTTPREQUEST = 0x800, |
+    TYPE_OBJECT_SUBREQUEST = 0x1000, |
+    TYPE_MEDIA = 0x4000, |
+    TYPE_FONT = 0x8000, |
+    TYPE_POPUP = 0x8000000, |
+    TYPE_GENERICBLOCK = 0x10000000, |
+    TYPE_GENERICHIDE = 0x20000000, |
+    TYPE_ELEMHIDE = 0x40000000, |
+  }; |
+ |
+  // Maps option names (already lower-cased, with '_' replaced by '-') to the |
+  // content type flag they select. Entries marked "Backwards compat" are |
+  // legacy names that alias a current flag. |
+  StringMap<int> typeMap { |
+    {u"other"_str, TYPE_OTHER}, |
+    {u"script"_str, TYPE_SCRIPT}, |
+    {u"image"_str, TYPE_IMAGE}, |
+    {u"stylesheet"_str, TYPE_STYLESHEET}, |
+    {u"object"_str, TYPE_OBJECT}, |
+    {u"subdocument"_str, TYPE_SUBDOCUMENT}, |
+    {u"document"_str, TYPE_DOCUMENT}, |
+    {u"xbl"_str, TYPE_OTHER}, // Backwards compat |
+    {u"ping"_str, TYPE_PING}, |
+    {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST}, |
+    {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST}, |
+    {u"dtd"_str, TYPE_OTHER}, // Backwards compat |
+    {u"media"_str, TYPE_MEDIA}, |
+    {u"font"_str, TYPE_FONT}, |
+    {u"background"_str, TYPE_IMAGE}, // Backwards compat |
+ |
+    {u"popup"_str, TYPE_POPUP}, |
+    {u"genericblock"_str, TYPE_GENERICBLOCK}, |
+    {u"generichide"_str, TYPE_GENERICHIDE}, |
+    {u"elemhide"_str, TYPE_ELEMHIDE}, |
+  }; |
+ |
+  // Mask used when a filter names no content type, and the starting point |
+  // when the first type option is an inverted one: every bit of INT_MAX except |
+  // the types listed here, which therefore have to be requested explicitly. |
+  const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | |
+      TYPE_GENERICBLOCK | TYPE_GENERICHIDE); |
+ |
+  // Asks the JavaScript side (regexps.create) for a regular expression object. |
+  // |
+  // regexp:    expression source; the address of the String is handed to |
+  //            JavaScript as $0. |
+  // matchCase: handed to JavaScript as $1. |
+  // |
+  // Returns the integer produced by regexps.create, which callers keep as the |
+  // regexp ID for later regexps.* calls. |
+  int GenerateRegExp(const String& regexp, bool matchCase) |
+  { |
+    return EM_ASM_INT(return regexps.create($0, $1), &regexp, matchCase); |
+  } |
+} |
+ |
+// Builds a filter from its complete text. |
+// |
+// text:         full filter text, forwarded to ActiveFilter. |
+// patternStart: index of the first pattern character in the text. |
+// patternEnd:   index just past the pattern; the text following the character |
+//               at patternEnd is parsed as a comma-separated option list. |
+// |
+// Throws a String when an option is unknown (see ProcessOption) or when the |
+// JavaScript side reports an error for a /regexp/ pattern. |
+RegExpFilter::RegExpFilter(const String& text, |
+    String::size_type patternStart, String::size_type patternEnd) |
+  : ActiveFilter(text, true), mRegexpId(0), |
+    mRegexpSource(String(mText, patternStart, patternEnd - patternStart)), |
+    mContentType(-1), mMatchCase(false), mThirdParty(TrippleState::ANY) |
+{ |
+  String options(mText, patternEnd + 1); |
+  StringScanner scanner(options, u','); |
+  int nameBegin = 0; |
+  int nameEnd = -1;     // negative until the end of the option name is known |
+  int valueBegin = -1;  // negative while the option has no "=value" part |
+  bool finished; |
+  do |
+  { |
+    // done() is sampled before next(), so the body runs one more time for the |
+    // character returned once the input is exhausted - presumably the ',' |
+    // terminator given to the scanner, which flushes the final option. |
+    // NOTE(review): confirm against StringScanner.h. |
+    finished = scanner.done(); |
+    const auto current = scanner.next(); |
+    if (current == u'=') |
+    { |
+      // Only the first '=' separates name and value. |
+      if (nameEnd < 0) |
+      { |
+        nameEnd = scanner.position(); |
+        valueBegin = nameEnd + 1; |
+      } |
+    } |
+    else if (current == u',') |
+    { |
+      if (nameEnd < 0) |
+        nameEnd = scanner.position(); |
+      ProcessOption(options, nameBegin, nameEnd, valueBegin, scanner.position()); |
+ |
+      // Reset the markers for the next option. |
+      nameBegin = scanner.position() + 1; |
+      nameEnd = -1; |
+      valueBegin = -1; |
+    } |
+  } while (!finished); |
+ |
+  // No type option was seen: fall back to the default mask. |
+  if (mContentType < 0) |
+    mContentType = defaultTypeMask; |
+ |
+  // Only patterns written as /.../ are turned into a regexp right away; all |
+  // others keep mRegexpId at zero here. |
+  size_t sourceLength = mRegexpSource.length(); |
+  if (sourceLength < 2 || mRegexpSource[0] != u'/' || |
+      mRegexpSource[sourceLength - 1] != u'/') |
+  { |
+    return; |
+  } |
+ |
+  // Strip the enclosing slashes. |
+  mRegexpSource.reset(mRegexpSource, 1 , sourceLength - 2); |
+  mRegexpId = GenerateRegExp(mRegexpSource, mMatchCase); |
+ |
+  // A non-negative length means an error message is available. |
+  int messageLength = EM_ASM_INT(return regexps.getErrorLength($0), mRegexpId); |
+  if (messageLength >= 0) |
+  { |
+    // NOTE(review): the destructor does not run when a constructor throws, so |
+    // regexps.delete is not called from this class for mRegexpId - confirm the |
+    // JavaScript side releases failed entries. |
+    String message(messageLength); |
+    EM_ASM_ARGS(regexps.getError($0, $1), mRegexpId, message.data()); |
+    throw message; |
+  } |
+} |
+ |
+// Releases the JavaScript-side regexp object, if one was ever created. |
+RegExpFilter::~RegExpFilter() |
+{ |
+  // Zero means no regexp has been generated for this filter. |
+  if (!mRegexpId) |
+    return; |
+ |
+  EM_ASM_ARGS(regexps.delete($0), mRegexpId); |
+} |
+ |
+// Applies a single option from the filter's option list to this filter. |
+// |
+// options:     the complete option list. |
+// optionStart: index of the first character of the option (a leading '~' |
+//              inverts it). |
+// optionEnd:   index just past the option name. |
+// valueStart:  index of the first value character, negative if there is no |
+//              value. |
+// valueEnd:    index just past the value. |
+// |
+// Empty options are ignored. Throws a String ("Unknown option <name>") when |
+// the name is not recognized. |
+void RegExpFilter::ProcessOption(String& options, int optionStart, |
+    int optionEnd, int valueStart, int valueEnd) |
+{ |
+  if (optionEnd <= optionStart) |
+    return; |
+ |
+  const bool inverted = (options[optionStart] == u'~'); |
+  if (inverted) |
+    ++optionStart; |
+ |
+  // Normalize the name: ASCII upper case becomes lower case, '_' becomes '-'. |
+  String name(options, optionStart, optionEnd - optionStart); |
+  for (size_t pos = 0; pos < name.length(); ++pos) |
+  { |
+    char16_t ch = name[pos]; |
+    if (ch == u'_') |
+      name[pos] = u'-'; |
+    else if (ch >= u'A' && ch <= u'Z') |
+      name[pos] = ch + u'a' - u'A'; |
+  } |
+ |
+  const bool hasValue = (valueStart >= 0 && valueEnd > valueStart); |
+ |
+  // Content type options. |
+  auto typeEntry = typeMap.find(name); |
+  if (typeEntry != typeMap.end()) |
+  { |
+    // The first type option decides the starting mask: everything in the |
+    // default mask for an inverted option, nothing otherwise. |
+    if (mContentType < 0) |
+      mContentType = inverted ? defaultTypeMask : 0; |
+ |
+    if (inverted) |
+      mContentType &= ~typeEntry->second; |
+    else |
+      mContentType |= typeEntry->second; |
+    return; |
+  } |
+ |
+  if (name.equals(u"domain"_str)) |
+  { |
+    if (hasValue) |
+      ParseDomains(options, valueStart, valueEnd - valueStart, u'|'); |
+    return; |
+  } |
+ |
+  if (name.equals(u"sitekey"_str)) |
+  { |
+    if (!hasValue) |
+      return; |
+ |
+    // The value is a '|'-separated list of keys; empty entries are skipped. |
+    StringScanner scanner(String(options, valueStart, valueEnd - valueStart), u'|'); |
+    size_t keyStart = 0; |
+    bool finished; |
+    do |
+    { |
+      // done() is read before next() so that the body also runs for the |
+      // character returned after the last key. |
+      finished = scanner.done(); |
+      if (scanner.next() == u'|') |
+      { |
+        if (scanner.position() > keyStart) |
+          AddSitekey(String(options, valueStart + keyStart, scanner.position() - keyStart)); |
+        keyStart = scanner.position() + 1; |
+      } |
+    } while (!finished); |
+    return; |
+  } |
+ |
+  if (name.equals(u"match-case"_str)) |
+  { |
+    mMatchCase = !inverted; |
+    return; |
+  } |
+ |
+  if (name.equals(u"third-party"_str)) |
+  { |
+    mThirdParty = inverted ? TrippleState::NO : TrippleState::YES; |
+    return; |
+  } |
+ |
+  if (name.equals(u"collapse"_str)) |
+  { |
+    mCollapse = inverted ? TrippleState::NO : TrippleState::YES; |
+    return; |
+  } |
+ |
+  // Nothing matched: report the normalized name to the caller. |
+  String error(u"Unknown option "_str); |
+  error.append(name); |
+  throw std::move(error.ensure_own_buffer()); |
+} |
+ |
+// Creates the filter object for a filter text. |
+// |
+// A leading "@@" selects a WhitelistFilter, anything else a RegExpFilter. The |
+// pattern runs from after that prefix up to the first '$' (or to the end of |
+// the text if there is none). |
+// |
+// Returns a newly allocated filter; if construction throws a String, an |
+// InvalidFilter carrying that String as the reason is returned instead. |
+Filter* RegExpFilter::Create(const String& text) |
+{ |
+  const bool whitelisted = |
+      text.length() >= 2 && text[0] == u'@' && text[1] == u'@'; |
+  const String::size_type patternStart = whitelisted ? 2 : 0; |
+ |
+  String::size_type patternEnd = text.find(u'$', patternStart); |
+  if (patternEnd == text.npos) |
+    patternEnd = text.length(); |
+ |
+  try |
+  { |
+    if (whitelisted) |
+      return new WhitelistFilter(text, patternStart, patternEnd); |
+    return new RegExpFilter(text, patternStart, patternEnd); |
+  } |
+  catch (const String& reason) |
+  { |
+    return new InvalidFilter(text, reason); |
+  } |
+} |
+ |
+// Publishes the content type table to JavaScript as |
+// exports.RegExpFilter.typeMap. Each key is the option name with its first |
+// "-" replaced by "_" and converted to upper case; the value is the flag. |
+void RegExpFilter::InitJSTypes() |
+{ |
+  // Start from an empty object, then add one property per table entry. |
+  EM_ASM(exports.RegExpFilter.typeMap = {};); |
+ |
+  auto entry = typeMap.begin(); |
+  while (entry != typeMap.end()) |
+  { |
+    EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_").toUpperCase()] = $1, &(entry->first), entry->second); |
+    ++entry; |
+  } |
+} |
+ |
+// Translates a filter pattern into regular expression source. |
+// |
+//   '*'   wildcard, emitted as ".*" (runs of '*' collapse, leading ones are |
+//         dropped) |
+//   '^'   separator placeholder |
+//   '|'   anchor when first or last character, "||" at the start is the |
+//         extended anchor; a literal '|' anywhere else |
+// |
+// Every other character below 128 that is not an ASCII letter or digit is |
+// escaped with a backslash. |
+String RegExpFilter::RegExpFromSource(const String& source) |
+{ |
+  /* TODO: this is very inefficient */ |
+ |
+  // Note: trailing wildcards are not removed here; apart from that the result |
+  // should be identical to Filter.toRegExp(). |
+  String regexp; |
+  const String::size_type length = source.length(); |
+ |
+  // Starting with '*' as the previous character drops leading wildcards. |
+  String::value_type previous = u'*'; |
+  for (String::size_type pos = 0; pos < length; ++pos) |
+  { |
+    String::value_type current = source[pos]; |
+    if (current == u'*') |
+    { |
+      if (previous != u'*') |
+        regexp.append(u".*"_str); |
+    } |
+    else if (current == u'^') |
+    { |
+      regexp.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x60\\x7B-\\x7F]|$)"_str); |
+    } |
+    else if (current == u'|') |
+    { |
+      if (pos == 0) |
+      { |
+        // Anchor at expression start, possibly the extended "||" form. |
+        if (pos + 1 < length && source[pos + 1] == u'|') |
+        { |
+          regexp.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str); |
+          ++pos;  // the second '|' is consumed as well |
+        } |
+        else |
+          regexp.append(u'^'); |
+      } |
+      else if (pos == length - 1) |
+      { |
+        // Anchor at expression end, ignored right after a separator |
+        // placeholder. |
+        if (previous != u'^') |
+          regexp.append(u'$'); |
+      } |
+      else |
+      { |
+        // Not an anchor in this position, emit it escaped. |
+        regexp.append(u"\\|"_str); |
+      } |
+    } |
+    else |
+    { |
+      const bool isAlphaNumeric = |
+          (current >= u'a' && current <= u'z') || |
+          (current >= u'A' && current <= u'Z') || |
+          (current >= u'0' && current <= u'9'); |
+      if (!isAlphaNumeric && current < 128) |
+        regexp.append(u'\\'); |
+      regexp.append(current); |
+    } |
+    previous = current; |
+  } |
+  return std::move(regexp.ensure_own_buffer()); |
+} |
+ |
+// Reports the filter type; this implementation always answers BLOCKING. |
+Filter::Type RegExpFilter::GetType() const |
+{ |
+  return Filter::Type::BLOCKING; |
+} |
+ |
+// Tests whether this filter applies to a request. |
+// |
+// location:   string the regular expression is tested against. |
+// typeMask:   content type flags of the request; at least one must be part of |
+//             this filter's content type mask. |
+// docDomain:  passed to IsActiveOnDomain together with sitekey. |
+// thirdParty: whether the request is a third-party one; compared with the |
+//             filter's third-party option. |
+// sitekey:    passed to IsActiveOnDomain together with docDomain. |
+// |
+// Returns false if any precondition fails, otherwise the result of the |
+// JavaScript-side regexps.test call. |
+bool RegExpFilter::Matches(const String& location, int typeMask, |
+    String& docDomain, bool thirdParty, const String& sitekey) const |
+{ |
+  // Cheap checks first, in this order; the domain check only runs when the |
+  // type and third-party checks have passed. |
+  if (!(mContentType & typeMask)) |
+    return false; |
+  if (mThirdParty == TrippleState::YES && !thirdParty) |
+    return false; |
+  if (mThirdParty == TrippleState::NO && thirdParty) |
+    return false; |
+  if (!IsActiveOnDomain(docDomain, sitekey)) |
+    return false; |
+ |
+  // The regexp is generated on first use when no ID has been stored yet. |
+  if (!mRegexpId) |
+    mRegexpId = GenerateRegExp(RegExpFromSource(mRegexpSource), mMatchCase); |
+ |
+  return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location); |
+} |