Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 #include <climits> | |
2 | |
3 #include <emscripten.h> | |
4 | |
5 #include "RegExpFilter.h" | |
6 #include "StringScanner.h" | |
7 #include "StringMap.h" | |
8 | |
9 namespace | |
10 { | |
// Content type flags. Every constant occupies its own bit so that several
// types can be combined into (and tested against) a single integer mask.
enum
{
  TYPE_OTHER = 1 << 0,
  TYPE_SCRIPT = 1 << 1,
  TYPE_IMAGE = 1 << 2,
  TYPE_STYLESHEET = 1 << 3,
  TYPE_OBJECT = 1 << 4,
  TYPE_SUBDOCUMENT = 1 << 5,
  TYPE_DOCUMENT = 1 << 6,
  TYPE_PING = 1 << 10,
  TYPE_XMLHTTPREQUEST = 1 << 11,
  TYPE_OBJECT_SUBREQUEST = 1 << 12,
  TYPE_MEDIA = 1 << 14,
  TYPE_FONT = 1 << 15,
  TYPE_POPUP = 1 << 27,
  TYPE_GENERICBLOCK = 1 << 28,
  TYPE_GENERICHIDE = 1 << 29,
  TYPE_ELEMHIDE = 1 << 30,
};
30 | |
31 StringMap<int> typeMap { | |
32 {u"other"_str, TYPE_OTHER}, | |
33 {u"script"_str, TYPE_SCRIPT}, | |
34 {u"image"_str, TYPE_IMAGE}, | |
35 {u"stylesheet"_str, TYPE_STYLESHEET}, | |
36 {u"object"_str, TYPE_OBJECT}, | |
37 {u"subdocument"_str, TYPE_SUBDOCUMENT}, | |
38 {u"document"_str, TYPE_DOCUMENT}, | |
39 {u"xbl"_str, TYPE_OTHER}, // Backwards compat | |
40 {u"ping"_str, TYPE_PING}, | |
41 {u"xmlhttprequest"_str, TYPE_XMLHTTPREQUEST}, | |
42 {u"object-subrequest"_str, TYPE_OBJECT_SUBREQUEST}, | |
43 {u"dtd"_str, TYPE_OTHER}, // Backwards compat | |
44 {u"media"_str, TYPE_MEDIA}, | |
45 {u"font"_str, TYPE_FONT}, | |
46 {u"background"_str, TYPE_IMAGE}, // Backwards compat | |
47 | |
48 {u"popup"_str, TYPE_POPUP}, | |
49 {u"genericblock"_str, TYPE_GENERICBLOCK}, | |
50 {u"generichide"_str, TYPE_GENERICHIDE}, | |
51 {u"elemhide"_str, TYPE_ELEMHIDE}, | |
52 }; | |
53 | |
54 const int defaultTypeMask = INT_MAX & ~(TYPE_DOCUMENT | TYPE_ELEMHIDE | | |
55 TYPE_POPUP | TYPE_GENERICBLOCK | TYPE_GENERICHIDE); | |
56 | |
57 int GenerateRegExp(const String& regexp, bool matchCase) | |
58 { | |
59 return EM_ASM_INT(return regexps.create($0, $1), ®exp, matchCase); | |
60 } | |
61 | |
62 void NormalizeWhitespace(DependentString& text) | |
63 { | |
64 // We want to remove all spaces but bail out early in the common scenario | |
65 // that the string contains no spaces. | |
66 | |
67 // Look for the first space | |
68 String::size_type len = text.length(); | |
69 String::size_type pos; | |
70 for (pos = 0; pos < len; pos++) | |
71 if (text[pos] == ' ') | |
72 break; | |
73 | |
74 if (pos >= len) | |
75 return; | |
76 | |
77 // Found spaces, move characters to remove them | |
78 String::size_type delta = 1; | |
79 for (pos = pos + 1; pos < len; pos++) | |
80 { | |
81 if (text[pos] == ' ') | |
82 delta++; | |
83 else | |
84 text[pos - delta] = text[pos]; | |
85 } | |
86 text.reset(text, 0, len - delta); | |
87 } | |
88 | |
89 void ParseOption(String& text, DependentString& error, RegExpFilterData& data, | |
90 int optionStart, int optionEnd, int valueStart, int valueEnd) | |
91 { | |
92 if (optionEnd <= optionStart) | |
93 return; | |
94 | |
95 bool reverse = false; | |
96 if (text[optionStart] == u'~') | |
97 { | |
98 reverse = true; | |
99 optionStart++; | |
100 } | |
101 | |
102 DependentString name(text, optionStart, optionEnd - optionStart); | |
103 for (size_t i = 0; i < name.length(); ++i) | |
104 { | |
105 char16_t currChar = name[i]; | |
106 if (currChar >= u'A' && currChar <= u'Z') | |
107 name[i] = currChar + u'a' - u'A'; | |
108 else if (currChar == u'_') | |
109 name[i] = u'-'; | |
110 } | |
111 | |
112 auto it = typeMap.find(name); | |
113 if (it) | |
114 { | |
115 if (data.mContentType < 0) | |
116 data.mContentType = reverse ? defaultTypeMask : 0; | |
117 if (reverse) | |
118 data.mContentType &= ~it->second; | |
119 else | |
120 data.mContentType |= it->second; | |
121 } | |
122 else if (name.equals(u"domain"_str)) | |
123 { | |
124 if (valueStart >= 0 && valueEnd > valueStart) | |
125 { | |
126 data.mDomainsStart = valueStart; | |
127 data.mDomainsEnd = valueEnd; | |
128 DependentString(text, valueStart, valueEnd - valueStart).tolower(); | |
129 } | |
130 } | |
131 else if (name.equals(u"sitekey"_str)) | |
132 { | |
133 if (valueStart >= 0 && valueEnd > valueStart) | |
134 { | |
135 data.mSitekeysStart = valueStart; | |
136 data.mSitekeysEnd = valueEnd; | |
137 } | |
138 } | |
139 else if (name.equals(u"match-case"_str)) | |
140 data.mMatchCase = !reverse; | |
141 else if (name.equals(u"third-party"_str)) | |
142 data.mThirdParty = reverse ? TrippleState::NO : TrippleState::YES; | |
143 else if (name.equals(u"collapse"_str)) | |
144 data.mCollapse = reverse ? TrippleState::NO : TrippleState::YES; | |
145 else | |
146 error.reset(u"filter_unknown_option"_str); | |
147 } | |
148 | |
149 void ParseOptions(String& text, DependentString& error, RegExpFilterData& data , | |
150 String::size_type optionsStart) | |
151 { | |
152 data.mMatchCase = false; | |
153 data.mThirdParty = TrippleState::ANY; | |
154 data.mCollapse = TrippleState::ANY; | |
155 data.mDomainsStart = String::npos; | |
156 data.mSitekeysStart = String::npos; | |
157 if (optionsStart >= text.length()) | |
158 { | |
159 data.mContentType = defaultTypeMask; | |
160 return; | |
161 } | |
162 | |
163 data.mContentType = -1; | |
164 | |
165 int optionStart = data.mPatternEnd + 1; | |
166 int optionEnd = -1; | |
167 int valueStart = -1; | |
168 | |
169 StringScanner scanner(text, optionStart, u','); | |
170 bool done = false; | |
171 while (!done) | |
172 { | |
173 done = scanner.done(); | |
174 switch (scanner.next()) | |
175 { | |
176 case u'=': | |
177 if (optionEnd < 0) | |
178 { | |
179 optionEnd = scanner.position(); | |
180 valueStart = optionEnd + 1; | |
181 } | |
182 break; | |
183 case u',': | |
184 if (optionEnd < 0) | |
185 optionEnd = scanner.position(); | |
186 ParseOption(text, error, data, optionStart, optionEnd, valueStart, | |
187 scanner.position()); | |
188 if (!error.empty()) | |
189 return; | |
190 | |
191 optionStart = scanner.position() + 1; | |
192 optionEnd = -1; | |
193 valueStart = -1; | |
194 break; | |
195 } | |
196 } | |
197 | |
198 if (data.mContentType < 0) | |
199 data.mContentType = defaultTypeMask; | |
200 } | |
201 } | |
202 | |
203 RegExpFilter::RegExpFilter(Type type, const String& text, const RegExpFilterData & data) | |
204 : ActiveFilter(type, text, true), mData(data) | |
205 { | |
206 } | |
207 | |
208 RegExpFilter::~RegExpFilter() | |
209 { | |
210 if (mData.HasRegExp()) | |
211 EM_ASM_ARGS(regexps.delete($0), mData.mRegexpId); | |
212 } | |
213 | |
214 Filter::Type RegExpFilter::Parse(DependentString& text, DependentString& error, | |
215 RegExpFilterData& data) | |
216 { | |
217 NormalizeWhitespace(text); | |
218 | |
219 bool blocking = true; | |
220 | |
221 data.mPatternStart = 0; | |
222 if (text.length() >= 2 && text[0] == u'@' && text[1] == u'@') | |
223 { | |
224 blocking = false; | |
225 data.mPatternStart = 2; | |
226 } | |
227 | |
228 data.mPatternEnd = text.find(u'$', data.mPatternStart); | |
229 if (data.mPatternEnd == text.npos) | |
230 data.mPatternEnd = text.length(); | |
231 | |
232 ParseOptions(text, error, data, data.mPatternEnd + 1); | |
233 if (!error.empty()) | |
234 return Type::INVALID; | |
235 | |
236 if (data.mPatternEnd - data.mPatternStart >= 2 && | |
237 text[data.mPatternStart] == u'/' && | |
238 text[data.mPatternEnd - 1] == u'/') | |
239 { | |
240 data.SetRegExp(GenerateRegExp(DependentString(text, data.mPatternStart + 1, | |
241 data.mPatternEnd - data.mPatternStart - 2), data.mMatchCase)); | |
242 if (data.mRegexpId == -1) | |
243 { | |
244 error.reset(u"filter_invalid_regexp"_str); | |
245 return Type::INVALID; | |
246 } | |
247 } | |
248 | |
249 if (blocking) | |
250 return Type::BLOCKING; | |
251 else | |
252 return Type::WHITELIST; | |
sergei
2017/01/10 15:57:50
It seems we could simply change type of blocking v
Wladimir Palant
2017/03/13 17:42:11
Done.
| |
253 } | |
254 | |
255 void RegExpFilter::ParseSitekeys(const String& sitekeys) const | |
256 { | |
257 StringScanner scanner(sitekeys, 0, u'|'); | |
258 size_t start = 0; | |
259 bool done = false; | |
260 while (!done) | |
261 { | |
262 done = scanner.done(); | |
263 if (scanner.next() == u'|') | |
264 { | |
265 if (scanner.position() > start) | |
266 AddSitekey(DependentString(sitekeys, start, scanner.position() - start)) ; | |
267 start = scanner.position() + 1; | |
268 } | |
269 } | |
270 } | |
271 | |
272 void RegExpFilter::InitJSTypes() | |
273 { | |
274 EM_ASM(exports.RegExpFilter.typeMap = {};); | |
275 for (auto it = typeMap.begin(); it != typeMap.end(); ++it) | |
276 EM_ASM_ARGS(exports.RegExpFilter.typeMap[readString($0).replace("-", "_").to UpperCase()] = $1, &(it->first), it->second); | |
277 } | |
278 | |
279 OwnedString RegExpFilter::RegExpFromSource(const String& source) | |
280 { | |
281 /* TODO: this is very inefficient */ | |
282 | |
283 // Note: This doesn't remove trailing wildcards, otherwise the result should | |
284 // be identical to Filter.toRegExp(). | |
285 OwnedString result; | |
286 String::value_type prevChar = u'*'; | |
287 for (String::size_type i = 0; i < source.length(); ++i) | |
288 { | |
289 String::value_type currChar = source[i]; | |
290 switch (currChar) | |
291 { | |
292 case u'*': | |
293 if (prevChar != u'*') | |
294 result.append(u".*"_str); | |
295 break; | |
296 case u'^': | |
297 result.append(u"(?:[\\x00-\\x24\\x26-\\x2C\\x2F\\x3A-\\x40\\x5B-\\x5E\\x 60\\x7B-\\x7F]|$)"_str); | |
298 break; | |
299 case u'|': | |
300 if (i == 0) | |
301 { | |
302 // Anchor at expression start, maybe extended anchor? | |
303 if (i + 1 < source.length() && source[i + 1] == u'|') | |
304 { | |
305 result.append(u"^[\\w\\-]+:\\/+(?!\\/)(?:[^\\/]+\\.)?"_str); | |
306 ++i; | |
307 } | |
308 else | |
309 result.append(u'^'); | |
310 } | |
311 else if (i == source.length() - 1) | |
312 { | |
313 // Anchor at expression end, ignore if following separator placeholder | |
314 if (prevChar != u'^') | |
315 result.append(u'$'); | |
316 } | |
317 else | |
318 { | |
319 // Not actually an anchor, escape it | |
320 result.append(u"\\|"_str); | |
321 } | |
322 break; | |
323 default: | |
324 if (!(currChar >= u'a' && currChar <= u'z') && | |
325 !(currChar >= u'A' && currChar <= u'Z') && | |
326 !(currChar >= u'0' && currChar <= u'9') && | |
327 currChar < 128) | |
328 { | |
329 result.append(u'\\'); | |
330 } | |
331 result.append(currChar); | |
332 } | |
333 prevChar = currChar; | |
334 } | |
335 return result; | |
336 } | |
337 | |
338 RegExpFilter::DomainMap* RegExpFilter::GetDomains() const | |
339 { | |
340 if (!mData.DomainsParsingDone()) | |
341 { | |
342 ParseDomains(mData.GetDomainsSource(mText), u'|'); | |
343 mData.SetDomainsParsingDone(); | |
344 } | |
345 return ActiveFilter::GetDomains(); | |
346 } | |
347 | |
348 RegExpFilter::SitekeySet* RegExpFilter::GetSitekeys() const | |
349 { | |
350 if (!mData.SitekeyParsingDone()) | |
351 { | |
352 ParseSitekeys(mData.GetSitekeysSource(mText)); | |
353 mData.SetSitekeysParsingDone(); | |
354 } | |
355 return ActiveFilter::GetSitekeys(); | |
356 } | |
357 | |
358 bool RegExpFilter::Matches(const String& location, int typeMask, | |
359 DependentString& docDomain, bool thirdParty, const String& sitekey) const | |
360 { | |
361 if (!(mData.mContentType & typeMask) || | |
362 (mData.mThirdParty == TrippleState::YES && !thirdParty) || | |
363 (mData.mThirdParty == TrippleState::NO && thirdParty) || | |
364 !IsActiveOnDomain(docDomain, sitekey)) | |
365 { | |
366 return false; | |
367 } | |
368 | |
369 if (!mData.RegExpParsingDone()) | |
370 { | |
371 const OwnedString pattern(mData.GetRegExpSource(mText)); | |
372 mData.SetRegExp(GenerateRegExp(RegExpFromSource(pattern), mData.mMatchCase)) ; | |
373 } | |
374 return EM_ASM_INT(return regexps.test($0, $1), mData.mRegexpId, &location); | |
375 } | |
OLD | NEW |