OLD | NEW |
(Empty) | |
| 1 #include <climits> |
| 2 |
| 3 #include <emscripten.h> |
| 4 |
| 5 #include "RegExpFilter.h" |
| 6 #include "StringScanner.h" |
| 7 #include "StringMap.h" |
| 8 |
| 9 namespace |
| 10 { |
| 11 enum |
| 12 { |
| 13 TYPE_OTHER = 0x1, |
| 14 TYPE_SCRIPT = 0x2, |
| 15 TYPE_IMAGE = 0x4, |
| 16 TYPE_STYLESHEET = 0x8, |
| 17 TYPE_OBJECT = 0x10, |
| 18 TYPE_SUBDOCUMENT = 0x20, |
| 19 TYPE_DOCUMENT = 0x40, |
| 20 TYPE_PING = 0x400, |
| 21 TYPE_XMLHTTPREQUEST = 0x800, |
| 22 TYPE_OBJECT_SUBREQUEST = 0x1000, |
| 23 TYPE_MEDIA = 0x4000, |
| 24 TYPE_FONT = 0x8000, |
| 25 TYPE_POPUP = 0x8000000, |
| 26 TYPE_GENERICBLOCK = 0x10000000, |
| 27 TYPE_GENERICHIDE = 0x20000000, |
| 28 TYPE_ELEMHIDE = 0x40000000, |
| 29 }; |
| 30 |
| 31 StringMap<int> typeMap { |
| 32 {u"other"_str, TYPE_OTHER}, |
| 33 {u"script"_str, TYPE_SCRIPT}, |
| 34 {u"image"_str, TYPE_IMAGE}, |
| 35 {u"stylesheet"_str, TYPE_STYLESHEET}, |
| 36 {u"object"_str, TYPE_OBJECT}, |
| 37 {u"subdocument"_str, TYPE_SUBDOCUMENT}, |
| 38 {u"document"_str, TYPE_DOCUMENT}, |
| 39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat |
| 40 {u"ping"_str, TYPE_PING}, |
| 41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST}, |
| 42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST}, |
| 43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat |
| 44 {u"media"_str, TYPE_MEDIA}, |
| 45 {u"font"_str, TYPE_FONT}, |
| 46 {u"background"_str, TYPE_IMAGE}, // Backwards compat |
| 47 |
| 48 {u"popup"_str, TYPE_POPUP}, |
| 49 {u"genericblock"_str, TYPE_GENERICBLOCK}, |
| 50 {u"generichide"_str, TYPE_GENERICHIDE}, |
| 51 {u"elemhide"_str, TYPE_ELEMHIDE}, |
| 52 }; |
| 53 |
| 54 int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | TYPE_POPUP | |
| 55 TYPE_GENERICBLOCK | TYPE_GENERICHIDE); |
| 56 |
| 57 int GenerateRegExp(const String& regexp, bool matchCase) |
| 58 { |
| 59 return EM_ASM_INT(return regexps.create($0, $1), ®exp, matchCase); |
| 60 } |
| 61 |
| 62 void NormalizeWhitespace(String& text) |
| 63 { |
| 64 // We want to remove all spaces but bail out early in the common scenario |
| 65 // that the string contains no spaces. |
| 66 |
| 67 // Look for the first space |
| 68 String::size_type len = text.length(); |
| 69 String::size_type pos; |
| 70 for (pos = 0; pos < len; pos++) |
| 71 if (text[pos] == ' ') |
| 72 break; |
| 73 |
| 74 if (pos >= len) |
| 75 return; |
| 76 |
| 77 // Found spaces, move characters to remove them |
| 78 String::size_type delta = 1; |
| 79 for (pos = pos + 1; pos < len; pos++) |
| 80 { |
| 81 if (text[pos] == ' ') |
| 82 delta++; |
| 83 else |
| 84 text[pos - delta] = text[pos]; |
| 85 } |
| 86 text.reset(text, 0, len - delta); |
| 87 } |
| 88 } |
| 89 |
| 90 RegExpFilter::RegExpFilter(const String& text, const RegExpFilterData& data) |
| 91 : ActiveFilter(text, true), RegExpFilterData(data) |
| 92 { |
| 93 } |
| 94 |
| 95 RegExpFilter::~RegExpFilter() |
| 96 { |
| 97 if (HasRegExp()) |
| 98 EM_ASM_ARGS(regexps.delete($0), mRegexpId); |
| 99 } |
| 100 |
| 101 Filter::Type RegExpFilter::Parse(String& text, String& error, |
| 102 RegExpFilterData& data) |
| 103 { |
| 104 NormalizeWhitespace(text); |
| 105 |
| 106 bool blocking = true; |
| 107 |
| 108 data.mPatternStart = 0; |
| 109 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@') |
| 110 { |
| 111 blocking = false; |
| 112 data.mPatternStart = 2; |
| 113 } |
| 114 |
| 115 data.mPatternEnd = text.find(u'$', data.mPatternStart); |
| 116 if (data.mPatternEnd == text.npos) |
| 117 data.mPatternEnd = text.length(); |
| 118 |
| 119 ParseOptions(text, error, data, data.mPatternEnd + 1); |
| 120 if (!error.empty()) |
| 121 return Type::INVALID; |
| 122 |
| 123 if (data.mPatternEnd - data.mPatternStart >= 2 && |
| 124 text[data.mPatternStart] == u'/' && |
| 125 text[data.mPatternEnd - 1] == u'/') |
| 126 { |
| 127 data.SetRegExp(GenerateRegExp(String(text, data.mPatternStart + 1, |
| 128 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase)); |
| 129 |
| 130 int errorLength = EM_ASM_INT(return regexps.getErrorLength($0), |
| 131 data.mRegexpId); |
| 132 if (errorLength >= 0) |
| 133 { |
| 134 String regexpError(errorLength); |
| 135 EM_ASM_ARGS(regexps.getError($0, $1), data.mRegexpId, regexpError.data()); |
| 136 error.reset(std::move(regexpError)); |
| 137 return Type::INVALID; |
| 138 } |
| 139 } |
| 140 |
| 141 if (blocking) |
| 142 return Type::BLOCKING; |
| 143 else |
| 144 return Type::WHITELIST; |
| 145 } |
| 146 |
| 147 void RegExpFilter::ParseOptions(String& text, String& error, |
| 148 RegExpFilterData& data, String::size_type optionsStart) |
| 149 { |
| 150 data.mMatchCase = false; |
| 151 data.mThirdParty = TrippleState::ANY; |
| 152 data.mCollapse = TrippleState::ANY; |
| 153 data.mDomainsStart = String::npos; |
| 154 data.mSitekeysStart = String::npos; |
| 155 if (optionsStart >= text.length()) |
| 156 { |
| 157 data.mContentType = defaultTypeMask; |
| 158 return; |
| 159 } |
| 160 |
| 161 data.mContentType = -1; |
| 162 |
| 163 int optionStart = data.mPatternEnd + 1; |
| 164 int optionEnd = -1; |
| 165 int valueStart = -1; |
| 166 |
| 167 StringScanner scanner(text, optionStart, u','); |
| 168 bool done = false; |
| 169 while (!done) |
| 170 { |
| 171 done = scanner.done(); |
| 172 switch (scanner.next()) |
| 173 { |
| 174 case u'=': |
| 175 if (optionEnd < 0) |
| 176 { |
| 177 optionEnd = scanner.position(); |
| 178 valueStart = optionEnd + 1; |
| 179 } |
| 180 break; |
| 181 case u',': |
| 182 if (optionEnd < 0) |
| 183 optionEnd = scanner.position(); |
| 184 ParseOption(text, error, data, optionStart, optionEnd, valueStart, |
| 185 scanner.position()); |
| 186 if (!error.empty()) |
| 187 return; |
| 188 |
| 189 optionStart = scanner.position() + 1; |
| 190 optionEnd = -1; |
| 191 valueStart = -1; |
| 192 break; |
| 193 } |
| 194 } |
| 195 |
| 196 if (data.mContentType < 0) |
| 197 data.mContentType = defaultTypeMask; |
| 198 } |
| 199 |
| 200 void RegExpFilter::ParseOption(String& text, String& error, |
| 201 RegExpFilterData& data, int optionStart, int optionEnd, int valueStart, |
| 202 int valueEnd) |
| 203 { |
| 204 if (optionEnd <= optionStart) |
| 205 return; |
| 206 |
| 207 bool reverse = false; |
| 208 if (text[optionStart] == u'~') |
| 209 { |
| 210 reverse = true; |
| 211 optionStart++; |
| 212 } |
| 213 |
| 214 String name(text, optionStart, optionEnd - optionStart); |
| 215 for (size_t i = 0; i < name.length(); ++i) |
| 216 { |
| 217 char16_t currChar = name[i]; |
| 218 if (currChar >= u'A' && currChar <= u'Z') |
| 219 name[i] = currChar + u'a' - u'A'; |
| 220 else if (currChar == u'_') |
| 221 name[i] = u'-'; |
| 222 } |
| 223 |
| 224 auto it = typeMap.find(name); |
| 225 if (it != typeMap.end()) |
| 226 { |
| 227 if (data.mContentType < 0) |
| 228 data.mContentType = reverse ? defaultTypeMask : 0; |
| 229 if (reverse) |
| 230 data.mContentType &= ~it->second; |
| 231 else |
| 232 data.mContentType |= it->second; |
| 233 } |
| 234 else if (name.equals(u"domain"_str)) |
| 235 { |
| 236 if (valueStart >= 0 && valueEnd > valueStart) |
| 237 { |
| 238 data.mDomainsStart = valueStart; |
| 239 data.mDomainsEnd = valueEnd; |
| 240 ToLower(text, data.mDomainsStart, data.mDomainsEnd); |
| 241 } |
| 242 } |
| 243 else if (name.equals(u"sitekey"_str)) |
| 244 { |
| 245 if (valueStart >= 0 && valueEnd > valueStart) |
| 246 { |
| 247 data.mSitekeysStart = valueStart; |
| 248 data.mSitekeysEnd = valueEnd; |
| 249 } |
| 250 } |
| 251 else if (name.equals(u"match-case"_str)) |
| 252 data.mMatchCase = !reverse; |
| 253 else if (name.equals(u"third-party"_str)) |
| 254 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES; |
| 255 else if (name.equals(u"collapse"_str)) |
| 256 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES; |
| 257 else |
| 258 { |
| 259 error.reset(u"Unknown option "_str); |
| 260 error.append(name); |
| 261 } |
| 262 } |
| 263 |
| 264 void RegExpFilter::ParseSitekeys(const String& sitekeys) const |
| 265 { |
| 266 StringScanner scanner(sitekeys, 0, u'|'); |
| 267 size_t start = 0; |
| 268 bool done = false; |
| 269 while (!done) |
| 270 { |
| 271 done = scanner.done(); |
| 272 if (scanner.next() == u'|') |
| 273 { |
| 274 if (scanner.position() > start) |
| 275 AddSitekey(String(sitekeys, start, scanner.position() - start)); |
| 276 start = scanner.position() + 1; |
| 277 } |
| 278 } |
| 279 } |
| 280 |
| 281 void RegExpFilter::InitJSTypes() |
| 282 { |
| 283 EM_ASM(exports.RegExpFilter.typeMap = {};); |
| 284 for (auto it = typeMap.begin(); it != typeMap.end(); ++it) |
| 285 EM_ASM_ARGS(exports.RegExpFilter.typeMap[getStringData($0).replace("-", "_")
.toUpperCase()] = $1, &(it->first), it->second); |
| 286 } |
| 287 |
| 288 String RegExpFilter::RegExpFromSource(const String& source) |
| 289 { |
| 290 /* TODO: this is very inefficient */ |
| 291 |
| 292 // Note: This doesn't remove trailing wildcards, otherwise the result should |
| 293 // be identical to Filter.toRegExp(). |
| 294 String result; |
| 295 String::value_type prevChar = u'*'; |
| 296 for (String::size_type i = 0; i < source.length(); ++i) |
| 297 { |
| 298 String::value_type currChar = source[i]; |
| 299 switch (currChar) |
| 300 { |
| 301 case u'*': |
| 302 if (prevChar != u'*') |
| 303 result.append(u".*"_str); |
| 304 break; |
| 305 case u'^': |
| 306 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x
60\\x7B-\\x7F]|$)"_str); |
| 307 break; |
| 308 case u'|': |
| 309 if (i == 0) |
| 310 { |
| 311 // Anchor at expression start, maybe extended anchor? |
| 312 if (i + 1 < source.length() && source[i + 1] == u'|') |
| 313 { |
| 314 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str); |
| 315 ++i; |
| 316 } |
| 317 else |
| 318 result.append(u'^'); |
| 319 } |
| 320 else if (i == source.length() - 1) |
| 321 { |
| 322 // Anchor at expression end, ignore if following separator placeholder |
| 323 if (prevChar != u'^') |
| 324 result.append(u'$'); |
| 325 } |
| 326 else |
| 327 { |
| 328 // Not actually an anchor, escape it |
| 329 result.append(u"\\|"_str); |
| 330 } |
| 331 break; |
| 332 default: |
| 333 if (!(currChar >= u'a' && currChar <= u'z') && |
| 334 !(currChar >= u'A' && currChar <= u'Z') && |
| 335 !(currChar >= u'0' && currChar <= u'9') && |
| 336 currChar < 128) |
| 337 { |
| 338 result.append(u'\\'); |
| 339 } |
| 340 result.append(currChar); |
| 341 } |
| 342 prevChar = currChar; |
| 343 } |
| 344 return std::move(result.ensure_own_buffer()); |
| 345 } |
| 346 |
| 347 Filter::Type RegExpFilter::GetType() const |
| 348 { |
| 349 return Type::BLOCKING; |
| 350 } |
| 351 |
| 352 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const |
| 353 { |
| 354 if (!DomainsParsingDone()) |
| 355 { |
| 356 ParseDomains(GetDomainsSource(mText), u'|'); |
| 357 SetDomainsParsingDone(); |
| 358 } |
| 359 return ActiveFilter::GetDomains(); |
| 360 } |
| 361 |
| 362 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const |
| 363 { |
| 364 if (!SitekeyParsingDone()) |
| 365 { |
| 366 ParseSitekeys(GetSitekeysSource(mText)); |
| 367 SetSitekeysParsingDone(); |
| 368 } |
| 369 return ActiveFilter::GetSitekeys(); |
| 370 } |
| 371 |
| 372 bool RegExpFilter::Matches(const String& location, int typeMask, |
| 373 String& docDomain, bool thirdParty, const String& sitekey) const |
| 374 { |
| 375 if (!(mContentType & typeMask) || |
| 376 (mThirdParty == TrippleState::YES && !thirdParty) || |
| 377 (mThirdParty == TrippleState::NO && thirdParty) || |
| 378 !IsActiveOnDomain(docDomain, sitekey)) |
| 379 { |
| 380 return false; |
| 381 } |
| 382 |
| 383 if (!RegExpParsingDone()) |
| 384 { |
| 385 const String pattern(GetRegExpSource(mText)); |
| 386 SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mMatchCase)); |
| 387 } |
| 388 return EM_ASM_INT(return regexps.test($0, $1), mRegexpId, &location); |
| 389 } |
OLD | NEW |