Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 /* | 1 /* |
2 * This Source Code is subject to the terms of the Mozilla Public License | 2 * This Source Code is subject to the terms of the Mozilla Public License |
3 * version 2.0 (the "License"). You can obtain a copy of the License at | 3 * version 2.0 (the "License"). You can obtain a copy of the License at |
4 * http://mozilla.org/MPL/2.0/. | 4 * http://mozilla.org/MPL/2.0/. |
5 */ | 5 */ |
6 | 6 |
7 "use strict"; | |
8 | |
7 /** | 9 /** |
8 * @module crawler | 10 * @module crawler |
9 */ | 11 */ |
10 | 12 |
11 Cu.import("resource://gre/modules/Services.jsm"); | 13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); |
12 Cu.import("resource://gre/modules/Task.jsm"); | 14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); |
13 Cu.import("resource://gre/modules/Promise.jsm"); | 15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {}); |
14 Cu.import("resource://gre/modules/Timer.jsm"); | 16 const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {}); |
15 | 17 |
16 function abprequire(module) | 18 function abprequire(module) |
17 { | 19 { |
18 let result = {}; | 20 let result = {}; |
19 result.wrappedJSObject = result; | 21 result.wrappedJSObject = result; |
20 Services.obs.notifyObservers(result, "adblockplus-require", module); | 22 Services.obs.notifyObservers(result, "adblockplus-require", module); |
21 return result.exports; | 23 return result.exports; |
22 } | 24 } |
23 | 25 |
24 let {RequestNotifier} = abprequire("requestNotifier"); | 26 let {RequestNotifier} = abprequire("requestNotifier"); |
25 let {FilterNotifier} = abprequire("filterNotifier"); | 27 let {FilterNotifier} = abprequire("filterNotifier"); |
26 let {FilterStorage} = abprequire("filterStorage"); | 28 let {FilterStorage} = abprequire("filterStorage"); |
27 | 29 |
28 /** | 30 /** |
29 * Creates a pool of tabs and allocates them to tasks on request. | 31 * Allocates tabs on request but not more than maxtabs at the same time. |
30 * | 32 * |
31 * @param {tabbrowser} browser | 33 * @param {tabbrowser} browser |
32 * The tabbed browser where tabs should be created | 34 * The tabbed browser where tabs should be created |
33 * @param {int} maxtabs | 35 * @param {int} maxtabs |
34 * The maximum number of tabs to be allocated | 36 * The maximum number of tabs to be allocated |
35 * @constructor | 37 * @constructor |
36 */ | 38 */ |
37 function TabAllocator(browser, maxtabs) | 39 function TabAllocator(browser, maxtabs) |
38 { | 40 { |
39 browser.removeAllTabsBut(browser.tabs[0]) | 41 this._browser = browser; |
40 | 42 this._tabs = 0; |
41 this._tabs = []; | 43 this._maxtabs = maxtabs; |
42 for (let i = 0; i < maxtabs; i++) | 44 // The queue containing resolve functions of promises waiting for a tab. |
43 this._tabs.push(browser.addTab("about:blank")); | 45 this._resolvers = []; |
44 | 46 // Keep at least one tab alive to prevent browser from closing itself. |
45 browser.removeTab(browser.tabs[0]); | 47 this._tabKeepingWindowAlive = this._browser.tabs[0]; |
46 | 48 this._browser.removeAllTabsBut(this._tabKeepingWindowAlive); |
47 this._deferred = []; | |
48 } | 49 } |
49 TabAllocator.prototype = { | 50 TabAllocator.prototype = { |
51 _removeTabKeepingWindowAlive: function() | |
52 { | |
53 if (!this._tabKeepingWindowAlive) | |
54 return; | |
55 this._browser.removeTab(this._tabKeepingWindowAlive); | |
56 delete this._tabKeepingWindowAlive; | |
57 }, | |
58 | |
50 /** | 59 /** |
51 * Returns a promise that will resolve into a tab once a tab can be allocated. | 60 * Creates a blank tab in this._browser. |
61 * | |
62 * @return {Promise.<tab>} promise which resolves once the tab is fully initialized. | |
63 */ | |
64 _createTab: function() | |
65 { | |
66 this._tabs++; | |
67 let tab = this._browser.addTab("about:blank"); | |
68 if (tab.linkedBrowser.outerWindowID) | |
69 { | |
70 this._removeTabKeepingWindowAlive(); | |
71 return Promise.resolve(tab); | |
72 } | |
73 return new Promise((resolve, reject) => | |
74 { | |
75 let onBrowserInit = (msg) => | |
76 { | |
77 tab.linkedBrowser.messageManager.removeMessageListener("Browser:Init", o nBrowserInit); | |
78 this._removeTabKeepingWindowAlive(); | |
79 resolve(tab); | |
80 }; | |
81 // "Browser:Init" message is sent once the browser is ready, see | |
82 // https://bugzil.la/1256602#c1 | |
83 tab.linkedBrowser.messageManager.addMessageListener("Browser:Init", onBrow serInit); | |
84 }); | |
85 }, | |
86 | |
87 /** | |
88 * Returns a promise that will resolve into a tab once a tab is allocated. | |
52 * The tab cannot be used by other tasks until releaseTab() is called. | 89 * The tab cannot be used by other tasks until releaseTab() is called. |
53 * | 90 * |
54 * @result {Promise} | 91 * @result {Promise.<tab>} |
55 */ | 92 */ |
56 getTab: function() | 93 getTab: function() |
57 { | 94 { |
58 if (this._tabs.length) | 95 if (this._tabs < this._maxtabs) |
59 return this._tabs.shift(); | 96 return this._createTab(); |
60 else | 97 return new Promise((resolve, reject) => this._resolvers.push(resolve)); |
61 { | |
62 let deferred = Promise.defer(); | |
63 this._deferred.push(deferred); | |
64 return deferred.promise; | |
65 } | |
66 }, | 98 }, |
67 | 99 |
68 /** | 100 /** |
69 * Adds a tab back to the pool so that it can be used by other tasks. | 101 * Adds a tab back to the pool so that it can be used by other tasks. |
70 * | 102 * |
71 * @param {tab} tab | 103 * @param {tab} tab |
72 */ | 104 */ |
73 releaseTab: function(tab) | 105 releaseTab: function(tab) |
74 { | 106 { |
75 let browser = tab.parentNode.tabbrowser; | 107 // If we are about to close last tab don't close it immediately to keep |
76 browser.removeTab(tab); | 108 // the window alive. It will be closed when a new tab is created. |
77 tab = browser.addTab("about:blank"); | 109 if (this._tabs > 1) |
78 | 110 this._browser.removeTab(tab); |
79 if (this._deferred.length) | |
80 this._deferred.shift().resolve(tab); | |
81 else | 111 else |
82 this._tabs.push(tab); | 112 { |
83 } | 113 // navigate away from previously opened URL |
114 tab.linkedBrowser.loadURI("about:blank", null, null); | |
115 this._tabKeepingWindowAlive = tab; | |
116 } | |
117 | |
118 this._tabs--; | |
119 if (this._resolvers.length && this._tabs < this._maxtabs) | |
120 { | |
121 this._resolvers.shift()(this._createTab()); | |
122 } | |
123 }, | |
84 }; | 124 }; |
85 | 125 |
86 /** | 126 /** |
87 * Once created, this object will make sure all new windows are dismissed | 127 * Once created, this object will make sure all new windows are dismissed |
88 * immediately. | 128 * immediately. |
89 * | 129 * |
90 * @constructor | 130 * @constructor |
91 */ | 131 */ |
92 function WindowCloser() | 132 function WindowCloser() |
93 { | 133 { |
(...skipping 16 matching lines...) Expand all Loading... | |
110 { | 150 { |
111 if (window.document.documentElement.localName == 'dialog') | 151 if (window.document.documentElement.localName == 'dialog') |
112 window.document.documentElement.acceptDialog(); | 152 window.document.documentElement.acceptDialog(); |
113 else | 153 else |
114 window.close(); | 154 window.close(); |
115 }, false); | 155 }, false); |
116 }, | 156 }, |
117 | 157 |
118 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakRefer ence]) | 158 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakRefer ence]) |
119 }; | 159 }; |
160 | |
161 function configureFrameScript() | |
162 { | |
163 const info = require("info"); | |
164 let frameScriptPath = info.addonRoot + "/lib/child/frameScript.js"; | |
165 Services.mm.loadFrameScript(frameScriptPath, true); | |
166 | |
167 onShutdown.add(() => | |
168 { | |
169 Services.mm.removeDelayedFrameScript(frameScriptPath); | |
170 }); | |
171 } | |
120 | 172 |
121 /** | 173 /** |
122 * Starts the crawling session. The crawler opens each URL in a tab and stores | 174 * Starts the crawling session. The crawler opens each URL in a tab and stores |
123 * the results. | 175 * the results. |
124 * | 176 * |
125 * @param {Window} window | 177 * @param {Window} window |
126 * The browser window we're operating in | 178 * The browser window we're operating in |
127 * @param {String[]} urls | 179 * @param {String[]} urls |
128 * URLs to be crawled | 180 * URLs to be crawled |
129 * @param {int} number_of_tabs | 181 * @param {int} timeout |
182 * Load timeout in milliseconds | |
183 * @param {int} maxtabs | |
130 * Maximum number of tabs to be opened | 184 * Maximum number of tabs to be opened |
131 * @param {String} targetURL | 185 * @param {String} targetURL |
132 * URL that should receive the results | 186 * URL that should receive the results |
187 * @param {Function} onDone | |
188 * The callback which is called once all URLs have been crawled. | |
133 */ | 189 */ |
134 function run(window, urls, timeout, maxtabs, targetURL, onDone) | 190 function run(window, urls, timeout, maxtabs, targetURL, onDone) |
191 { | |
192 configureFrameScript(); | |
193 new Promise((resolve, reject) => | |
194 { | |
195 if (FilterStorage.subscriptions.length > 0) | |
196 { | |
197 resolve(); | |
198 return; | |
199 } | |
200 let onFiltersLoaded = (action, item, newValue, oldValue) => | |
201 { | |
202 if (action == "load") | |
203 { | |
204 FilterNotifier.removeListener(onFiltersLoaded); | |
205 resolve(); | |
206 } | |
207 }; | |
208 FilterNotifier.addListener(onFiltersLoaded); | |
209 }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)) | |
210 .catch(reportException); | |
211 } | |
212 exports.run = run; | |
213 | |
214 /** | |
215 * Spawns a {Task} task to crawl each url from urls argument and calls | |
216 * onDone when all tasks are finished. | |
217 * @param {Window} window | |
218 * The browser window we're operating in | |
219 * @param {String[]} urls | |
220 * URLs to be crawled | |
221 * @param {int} timeout | |
222 * Load timeout in milliseconds | |
223 * @param {int} maxtabs | |
224 * Maximum number of tabs to be opened | |
225 * @param {String} targetURL | |
226 * URL that should receive the results | |
227 * @param {Function} onDone | |
228 * The callback which is called once all tasks have finished. | |
229 */ | |
230 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) | |
135 { | 231 { |
136 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); | 232 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); |
137 | 233 |
138 let running = 0; | 234 let running = 0; |
139 let windowCloser = new WindowCloser(); | 235 let windowCloser = new WindowCloser(); |
140 let taskDone = function() | 236 let taskDone = function() |
141 { | 237 { |
142 running--; | 238 running--; |
143 if (running <= 0) | 239 if (running <= 0) |
144 { | 240 { |
145 windowCloser.stop(); | 241 windowCloser.stop(); |
146 onDone(); | 242 onDone(); |
147 } | 243 } |
148 }; | 244 }; |
149 | 245 |
150 new Promise(function(resolve, reject) | 246 for (let url of urls) |
151 { | 247 { |
152 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading) | 248 running++; |
153 { | 249 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(r esult) |
154 resolve(); | 250 { |
155 return; | 251 let request = new XMLHttpRequest(); |
156 } | 252 request.open("POST", targetURL); |
157 FilterNotifier.addListener((action, item, newValue, oldValue) => | 253 request.addEventListener("load", taskDone, false); |
158 { | 254 request.addEventListener("error", taskDone, false); |
159 if (action === "load") | 255 request.send(JSON.stringify(result)); |
160 { | 256 }, function(url, exception) |
161 resolve(); | 257 { |
162 } | 258 reportException(exception); |
163 }); | 259 |
164 }).then(_ => | 260 let request = new XMLHttpRequest(); |
165 { | 261 request.open("POST", targetURL); |
166 for (let url of urls) | 262 request.addEventListener("load", taskDone, false); |
167 { | 263 request.addEventListener("error", taskDone, false); |
168 running++; | 264 request.send(JSON.stringify({ |
169 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function (result) | 265 url: url, |
170 { | 266 startTime: Date.now(), |
171 let request = new XMLHttpRequest(); | 267 error: String(exception) |
172 request.open("POST", targetURL); | 268 })); |
173 request.addEventListener("load", taskDone, false); | 269 }.bind(null, url)); |
174 request.addEventListener("error", taskDone, false); | 270 } |
175 request.send(JSON.stringify(result)); | 271 } |
176 }, function(url, exception) | 272 |
177 { | 273 /** |
178 reportException(exception); | 274 * Expects to receive page info gathered in a content process for the specified |
179 | 275 * `tab`. If there is no relevant message within specified `timeout` then |
180 let request = new XMLHttpRequest(); | 276 * the result promise is resolved with error object. |
181 request.open("POST", targetURL); | 277 * @param tab |
182 request.addEventListener("load", taskDone, false); | 278 * Tab in which we are interested in |
183 request.addEventListener("error", taskDone, false); | 279 * @param {int} timeout |
184 request.send(JSON.stringify({ | 280 * Timeout in milliseconds |
185 url: url, | 281 * @return {Promise} promise which will be resolved with the received page info |
186 startTime: Date.now(), | 282 */ |
187 error: String(exception) | 283 function getPageInfo(tab, timeout) |
188 })); | 284 { |
189 }.bind(null, url)); | 285 return new Promise((resolve, result) => |
190 } | 286 { |
191 // Be careful, `catch` does not catch exceptions from this `then` handler because | 287 let mm = tab.linkedBrowser.messageManager; |
192 // the latter one does not return an array of promises of asynchronous tasks | 288 let timerID; |
193 // and does not contain any waiting code. | 289 let onDone = (msg) => |
194 }).catch(reportException); | 290 { |
195 } | 291 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); |
196 exports.run = run; | 292 clearTimeout(timerID); |
293 resolve(msg.data); | |
294 } | |
295 mm.addMessageListener("abpcrawler:pageInfoGathered", onDone); | |
296 timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout); | |
297 }); | |
298 } | |
197 | 299 |
198 /** | 300 /** |
199 * Crawls a URL. This is a generator meant to be used via a Task object. | 301 * Crawls a URL. This is a generator meant to be used via a Task object. |
200 * | 302 * |
201 * @param {String} url | 303 * @param {String} url |
202 * @param {TabAllocator} tabAllocator | 304 * @param {TabAllocator} tabAllocator |
305 * @param {int} timeout | |
306 * Load timeout in milliseconds | |
203 * @result {Object} | 307 * @result {Object} |
204 * Crawling result | 308 * Crawling result |
205 */ | 309 */ |
206 function* crawl_url(url, tabAllocator, timeout) | 310 function* crawl_url(url, tabAllocator, timeout) |
207 { | 311 { |
208 let tab = yield tabAllocator.getTab(); | 312 let tab = yield tabAllocator.getTab(); |
209 let result = {url, requests: []}; | 313 let result = {url, requests: []}; |
210 | 314 let requestNotifier; |
211 try | 315 try |
212 { | 316 { |
213 result.startTime = Date.now(); | 317 result.startTime = Date.now(); |
214 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, f unction({type, location, filter}, scanComplete) | 318 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, |
sergei
2016/03/15 16:40:10
BTW, in addition, this part stops to work, I have
sergei
2016/03/16 14:44:23
https://issues.adblockplus.org/ticket/3815
| |
215 { | 319 function(entry, scanComplete) |
216 result.requests.push({location, contentType: type, filter}); | 320 { |
321 if (!entry) | |
322 return; | |
323 let {type: contentType, location, filter} = entry; | |
324 result.requests.push({location, contentType, filter}); | |
217 }); | 325 }); |
218 | 326 |
219 tab.linkedBrowser.loadURI(url, null, null); | 327 tab.linkedBrowser.loadURI(url, null, null); |
220 | 328 |
221 let mm = tab.linkedBrowser.messageManager; | 329 Object.assign(result, yield getPageInfo(tab, timeout)); |
222 let pageInfoFuture = new Promise((resolve, result) => | |
223 { | |
224 let timerID; | |
225 let onDone = (pageInfo) => | |
226 { | |
227 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); | |
Wladimir Palant
2016/03/15 10:07:10
So, which tab did you get the page info for?
The
sergei
2016/03/16 14:44:23
For the `tab`, it's "browser message manager" it a
| |
228 clearTimeout(timerID); | |
229 resolve(pageInfo); | |
230 } | |
231 mm.addMessageListener("abpcrawler:pageInfoGathered", (msg) => onDone(msg.d ata));; | |
232 timerID = setTimeout(onDone.bind(this, {error: "timeout"}), timeout); | |
233 }); | |
Wladimir Palant
2016/03/15 10:07:10
Please move this functionality into a separate fun
sergei
2016/03/16 14:44:23
Done.
| |
234 | |
235 let pageInfo = yield pageInfoFuture; | |
236 | |
237 result.finalUrl = tab.linkedBrowser.currentURI.spec; | 330 result.finalUrl = tab.linkedBrowser.currentURI.spec; |
238 Object.assign(result, pageInfo); | |
239 result.endTime = Date.now(); | 331 result.endTime = Date.now(); |
240 } | 332 } |
241 finally | 333 finally |
242 { | 334 { |
335 if (requestNotifier) | |
336 requestNotifier.shutdown(); | |
243 tabAllocator.releaseTab(tab); | 337 tabAllocator.releaseTab(tab); |
244 } | 338 } |
245 return result; | 339 return result; |
246 } | 340 } |
247 | 341 |
248 function reportException(e) | 342 function reportException(e) |
249 { | 343 { |
250 let stack = ""; | 344 let stack = ""; |
251 if (e && typeof e == "object" && "stack" in e) | 345 if (e && typeof e == "object" && "stack" in e) |
252 stack = e.stack + "\n"; | 346 stack = e.stack + "\n"; |
253 | 347 |
254 Cu.reportError(e); | 348 Cu.reportError(e); |
255 dump(e + "\n" + stack + "\n"); | 349 dump(e + "\n" + stack + "\n"); |
256 } | 350 } |
257 | |
258 let {addonRoot} = require("info"); | |
259 let frameScriptPath = addonRoot + "/lib/child/frameScript.js"; | |
260 let globalMessageManager = Services.mm; | |
261 globalMessageManager.loadFrameScript(frameScriptPath, true); | |
Wladimir Palant
2016/03/15 10:07:10
This should be a process script, no point using a
| |
262 | |
263 let onReportException = function(msg) | |
264 { | |
265 reportException(msg.objects); | |
Wladimir Palant
2016/03/15 10:07:10
Please don't use msg.objects - ever. That's a wrap
sergei
2016/03/16 14:44:23
Acknowledged. Actually, I wanted to avoid duplicat
sergei
2016/03/16 14:44:23
I know, I used `msg.objects` because we don't know
Wladimir Palant
2016/09/14 16:11:46
Worst-case scenario: deadlocks because all of that
| |
266 } | |
267 globalMessageManager.addMessageListener("abpcrawler:reportException", onReportEx ception); | |
268 | |
269 onShutdown.add(() => | |
270 { | |
271 globalMessageManager.removeMessageListener("abpcrawler:reportException", onRep ortException); | |
272 globalMessageManager.removeDelayedFrameScript(frameScriptPath); | |
273 }); | |
LEFT | RIGHT |