Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: lib/crawler.js

Issue 29338242: Issue 3792 - Fix to support multiprocess firefox (Closed)
Left Patch Set: Created March 14, 2016, 2:41 p.m.
Right Patch Set: change comment Created Sept. 30, 2016, 12:43 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « lib/child/frameScript.js ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 /* 1 /*
2 * This Source Code is subject to the terms of the Mozilla Public License 2 * This Source Code is subject to the terms of the Mozilla Public License
3 * version 2.0 (the "License"). You can obtain a copy of the License at 3 * version 2.0 (the "License"). You can obtain a copy of the License at
4 * http://mozilla.org/MPL/2.0/. 4 * http://mozilla.org/MPL/2.0/.
5 */ 5 */
6 6
7 "use strict";
8
7 /** 9 /**
8 * @module crawler 10 * @module crawler
9 */ 11 */
10 12
11 Cu.import("resource://gre/modules/Services.jsm"); 13 const {Services} = Cu.import("resource://gre/modules/Services.jsm", {});
12 Cu.import("resource://gre/modules/Task.jsm"); 14 const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {});
13 Cu.import("resource://gre/modules/Promise.jsm"); 15 const {Task} = Cu.import("resource://gre/modules/Task.jsm", {});
14 Cu.import("resource://gre/modules/Timer.jsm"); 16 const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {});
15 17
16 function abprequire(module) 18 function abprequire(module)
17 { 19 {
18 let result = {}; 20 let result = {};
19 result.wrappedJSObject = result; 21 result.wrappedJSObject = result;
20 Services.obs.notifyObservers(result, "adblockplus-require", module); 22 Services.obs.notifyObservers(result, "adblockplus-require", module);
21 return result.exports; 23 return result.exports;
22 } 24 }
23 25
24 let {RequestNotifier} = abprequire("requestNotifier"); 26 let {RequestNotifier} = abprequire("requestNotifier");
25 let {FilterNotifier} = abprequire("filterNotifier"); 27 let {FilterNotifier} = abprequire("filterNotifier");
26 let {FilterStorage} = abprequire("filterStorage"); 28 let {FilterStorage} = abprequire("filterStorage");
27 29
28 /** 30 /**
29 * Creates a pool of tabs and allocates them to tasks on request. 31 * Allocates tabs on request but not more than maxtabs at the same time.
30 * 32 *
31 * @param {tabbrowser} browser 33 * @param {tabbrowser} browser
32 * The tabbed browser where tabs should be created 34 * The tabbed browser where tabs should be created
33 * @param {int} maxtabs 35 * @param {int} maxtabs
34 * The maximum number of tabs to be allocated 36 * The maximum number of tabs to be allocated
35 * @constructor 37 * @constructor
36 */ 38 */
37 function TabAllocator(browser, maxtabs) 39 function TabAllocator(browser, maxtabs)
38 { 40 {
39 browser.removeAllTabsBut(browser.tabs[0]) 41 this._browser = browser;
40 42 this._tabs = 0;
41 this._tabs = []; 43 this._maxtabs = maxtabs;
42 for (let i = 0; i < maxtabs; i++) 44 // The queue containing resolve functions of promises waiting for a tab.
43 this._tabs.push(browser.addTab("about:blank")); 45 this._resolvers = [];
44 46 // Keep at least one tab alive to prevent browser from closing itself.
45 browser.removeTab(browser.tabs[0]); 47 this._tabKeepingWindowAlive = this._browser.tabs[0];
46 48 this._browser.removeAllTabsBut(this._tabKeepingWindowAlive);
47 this._deferred = [];
48 } 49 }
49 TabAllocator.prototype = { 50 TabAllocator.prototype = {
51 _removeTabKeepingWindowAlive: function()
52 {
53 if (!this._tabKeepingWindowAlive)
54 return;
55 this._browser.removeTab(this._tabKeepingWindowAlive);
56 delete this._tabKeepingWindowAlive;
57 },
58
50 /** 59 /**
51 * Returns a promise that will resolve into a tab once a tab can be allocated. 60 * Creates a blank tab in this._browser.
61 *
62 * @return {Promise.<tab>} promise which resolves once the tab is fully initialized.
63 */
64 _createTab: function()
65 {
66 this._tabs++;
67 let tab = this._browser.addTab("about:blank");
68 if (tab.linkedBrowser.outerWindowID)
69 {
70 this._removeTabKeepingWindowAlive();
71 return Promise.resolve(tab);
72 }
73 return new Promise((resolve, reject) =>
74 {
75 let onBrowserInit = (msg) =>
76 {
77 tab.linkedBrowser.messageManager.removeMessageListener("Browser:Init", onBrowserInit);
78 this._removeTabKeepingWindowAlive();
79 resolve(tab);
80 };
81 // "Browser:Init" message is sent once the browser is ready, see
82 // https://bugzil.la/1256602#c1
83 tab.linkedBrowser.messageManager.addMessageListener("Browser:Init", onBrowserInit);
84 });
85 },
86
87 /**
88 * Returns a promise that will resolve into a tab once a tab is allocated.
52 * The tab cannot be used by other tasks until releaseTab() is called. 89 * The tab cannot be used by other tasks until releaseTab() is called.
53 * 90 *
54 * @result {Promise} 91 * @result {Promise.<tab>}
55 */ 92 */
56 getTab: function() 93 getTab: function()
57 { 94 {
58 if (this._tabs.length) 95 if (this._tabs < this._maxtabs)
59 return this._tabs.shift(); 96 return this._createTab();
60 else 97 return new Promise((resolve, reject) => this._resolvers.push(resolve));
61 {
62 let deferred = Promise.defer();
63 this._deferred.push(deferred);
64 return deferred.promise;
65 }
66 }, 98 },
67 99
68 /** 100 /**
69 * Adds a tab back to the pool so that it can be used by other tasks. 101 * Adds a tab back to the pool so that it can be used by other tasks.
70 * 102 *
71 * @param {tab} tab 103 * @param {tab} tab
72 */ 104 */
73 releaseTab: function(tab) 105 releaseTab: function(tab)
74 { 106 {
75 let browser = tab.parentNode.tabbrowser; 107 // If we are about to close last tab don't close it immediately to keep
76 browser.removeTab(tab); 108 // the window alive. It will be closed when a new tab is created.
77 tab = browser.addTab("about:blank"); 109 if (this._tabs > 1)
78 110 this._browser.removeTab(tab);
79 if (this._deferred.length)
80 this._deferred.shift().resolve(tab);
81 else 111 else
82 this._tabs.push(tab); 112 {
83 } 113 // navigate away from previously opened URL
114 tab.linkedBrowser.loadURI("about:blank", null, null);
115 this._tabKeepingWindowAlive = tab;
116 }
117
118 this._tabs--;
119 if (this._resolvers.length && this._tabs < this._maxtabs)
120 {
121 this._resolvers.shift()(this._createTab());
122 }
123 },
84 }; 124 };
85 125
86 /** 126 /**
87 * Once created, this object will make sure all new windows are dismissed 127 * Once created, this object will make sure all new windows are dismissed
88 * immediately. 128 * immediately.
89 * 129 *
90 * @constructor 130 * @constructor
91 */ 131 */
92 function WindowCloser() 132 function WindowCloser()
93 { 133 {
(...skipping 16 matching lines...) Expand all
110 { 150 {
111 if (window.document.documentElement.localName == 'dialog') 151 if (window.document.documentElement.localName == 'dialog')
112 window.document.documentElement.acceptDialog(); 152 window.document.documentElement.acceptDialog();
113 else 153 else
114 window.close(); 154 window.close();
115 }, false); 155 }, false);
116 }, 156 },
117 157
118 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 158 QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference])
119 }; 159 };
160
161 function configureFrameScript()
162 {
163 const info = require("info");
164 let frameScriptPath = info.addonRoot + "/lib/child/frameScript.js";
165 Services.mm.loadFrameScript(frameScriptPath, true);
166
167 onShutdown.add(() =>
168 {
169 Services.mm.removeDelayedFrameScript(frameScriptPath);
170 });
171 }
120 172
121 /** 173 /**
122 * Starts the crawling session. The crawler opens each URL in a tab and stores 174 * Starts the crawling session. The crawler opens each URL in a tab and stores
123 * the results. 175 * the results.
124 * 176 *
125 * @param {Window} window 177 * @param {Window} window
126 * The browser window we're operating in 178 * The browser window we're operating in
127 * @param {String[]} urls 179 * @param {String[]} urls
128 * URLs to be crawled 180 * URLs to be crawled
129 * @param {int} number_of_tabs 181 * @param {int} timeout
182 * Load timeout in milliseconds
183 * @param {int} maxtabs
130 * Maximum number of tabs to be opened 184 * Maximum number of tabs to be opened
131 * @param {String} targetURL 185 * @param {String} targetURL
132 * URL that should receive the results 186 * URL that should receive the results
187 * @param {Function} onDone
188 * The callback which is called after finishing of crawling of all URLs.
133 */ 189 */
134 function run(window, urls, timeout, maxtabs, targetURL, onDone) 190 function run(window, urls, timeout, maxtabs, targetURL, onDone)
191 {
192 configureFrameScript();
193 new Promise((resolve, reject) =>
194 {
195 if (FilterStorage.subscriptions.length > 0)
196 {
197 resolve();
198 return;
199 }
200 let onFiltersLoaded = (action, item, newValue, oldValue) =>
201 {
202 if (action == "load")
203 {
204 FilterNotifier.removeListener(onFiltersLoaded);
205 resolve();
206 }
207 };
208 FilterNotifier.addListener(onFiltersLoaded);
209 }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone))
210 .catch(reportException);
211 }
212 exports.run = run;
213
214 /**
215 * Spawns a {Task} task to crawl each url from urls argument and calls
216 * onDone when all tasks are finished.
217 * @param {Window} window
218 * The browser window we're operating in
219 * @param {String[]} urls
220 * URLs to be crawled
221 * @param {int} timeout
222 * Load timeout in milliseconds
223 * @param {int} maxtabs
224 * Maximum number of tabs to be opened
225 * @param {String} targetURL
226 * URL that should receive the results
227 * @param {Function} onDone
228 * The callback which is called after finishing of all tasks.
229 */
230 function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)
135 { 231 {
136 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 232 let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs);
137 233
138 let running = 0; 234 let running = 0;
139 let windowCloser = new WindowCloser(); 235 let windowCloser = new WindowCloser();
140 let taskDone = function() 236 let taskDone = function()
141 { 237 {
142 running--; 238 running--;
143 if (running <= 0) 239 if (running <= 0)
144 { 240 {
145 windowCloser.stop(); 241 windowCloser.stop();
146 onDone(); 242 onDone();
147 } 243 }
148 }; 244 };
149 245
150 new Promise(function(resolve, reject) 246 for (let url of urls)
151 { 247 {
152 if (FilterStorage.subscriptions.length > 0 && !FilterStorage._loading) 248 running++;
153 { 249 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result)
154 resolve(); 250 {
155 return; 251 let request = new XMLHttpRequest();
156 } 252 request.open("POST", targetURL);
157 FilterNotifier.addListener((action, item, newValue, oldValue) => 253 request.addEventListener("load", taskDone, false);
158 { 254 request.addEventListener("error", taskDone, false);
159 if (action === "load") 255 request.send(JSON.stringify(result));
160 { 256 }, function(url, exception)
161 resolve(); 257 {
162 } 258 reportException(exception);
163 }); 259
164 }).then(_ => 260 let request = new XMLHttpRequest();
165 { 261 request.open("POST", targetURL);
166 for (let url of urls) 262 request.addEventListener("load", taskDone, false);
167 { 263 request.addEventListener("error", taskDone, false);
168 running++; 264 request.send(JSON.stringify({
169 Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function (result) 265 url: url,
170 { 266 startTime: Date.now(),
171 let request = new XMLHttpRequest(); 267 error: String(exception)
172 request.open("POST", targetURL); 268 }));
173 request.addEventListener("load", taskDone, false); 269 }.bind(null, url));
174 request.addEventListener("error", taskDone, false); 270 }
175 request.send(JSON.stringify(result)); 271 }
176 }, function(url, exception) 272
177 { 273 /**
178 reportException(exception); 274 * Expects to receive page info gathered in a content process for the specified
179 275 * `tab`. If there is no relevant message within specified `timeout` then
180 let request = new XMLHttpRequest(); 276 * the result promise is resolved with error object.
181 request.open("POST", targetURL); 277 * @param tab
182 request.addEventListener("load", taskDone, false); 278 * Tab in which we are interested in
183 request.addEventListener("error", taskDone, false); 279 * @param {int} timeout
184 request.send(JSON.stringify({ 280 * Timeout in milliseconds
185 url: url, 281 * @return {Promise} promise which will be resolved with the received page info
186 startTime: Date.now(), 282 */
187 error: String(exception) 283 function getPageInfo(tab, timeout)
188 })); 284 {
189 }.bind(null, url)); 285 return new Promise((resolve, result) =>
190 } 286 {
191 // Be careful, `catch` does not catch exeptions from this `then` handler because 287 let mm = tab.linkedBrowser.messageManager;
192 // the latter one does not return an array of promises of asynchrounous tasks 288 let timerID;
193 // and does not contain any waiting code. 289 let onDone = (msg) =>
194 }).catch(reportException); 290 {
195 } 291 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
196 exports.run = run; 292 clearTimeout(timerID);
293 resolve(msg.data);
294 }
295 mm.addMessageListener("abpcrawler:pageInfoGathered", onDone);
296 timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout);
297 });
298 }
197 299
198 /** 300 /**
199 * Crawls a URL. This is a generator meant to be used via a Task object. 301 * Crawls a URL. This is a generator meant to be used via a Task object.
200 * 302 *
201 * @param {String} url 303 * @param {String} url
202 * @param {TabAllocator} tabAllocator 304 * @param {TabAllocator} tabAllocator
305 * @param {int} timeout
306 * Load timeout in milliseconds
203 * @result {Object} 307 * @result {Object}
204 * Crawling result 308 * Crawling result
205 */ 309 */
206 function* crawl_url(url, tabAllocator, timeout) 310 function* crawl_url(url, tabAllocator, timeout)
207 { 311 {
208 let tab = yield tabAllocator.getTab(); 312 let tab = yield tabAllocator.getTab();
209 let result = {url, requests: []}; 313 let result = {url, requests: []};
210 314 let requestNotifier;
211 try 315 try
212 { 316 {
213 result.startTime = Date.now(); 317 result.startTime = Date.now();
214 let requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, function({type, location, filter}, scanComplete) 318 requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID,
sergei 2016/03/15 16:40:10 BTW, in addition, this part stops to work, I have
sergei 2016/03/16 14:44:23 https://issues.adblockplus.org/ticket/3815
215 { 319 function(entry, scanComplete)
216 result.requests.push({location, contentType: type, filter}); 320 {
321 if (!entry)
322 return;
323 let {type: contentType, location, filter} = entry;
324 result.requests.push({location, contentType, filter});
217 }); 325 });
218 326
219 tab.linkedBrowser.loadURI(url, null, null); 327 tab.linkedBrowser.loadURI(url, null, null);
220 328
221 let mm = tab.linkedBrowser.messageManager; 329 Object.assign(result, yield getPageInfo(tab, timeout));
222 let pageInfoFuture = new Promise((resolve, result) =>
223 {
224 let timerID;
225 let onDone = (pageInfo) =>
226 {
227 mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone);
Wladimir Palant 2016/03/15 10:07:10 So, which tab did you get the page info for? The
sergei 2016/03/16 14:44:23 For the `tab`, it's "browser message manager" it a
228 clearTimeout(timerID);
229 resolve(pageInfo);
230 }
231 mm.addMessageListener("abpcrawler:pageInfoGathered", (msg) => onDone(msg.data));;
232 timerID = setTimeout(onDone.bind(this, {error: "timeout"}), timeout);
233 });
Wladimir Palant 2016/03/15 10:07:10 Please move this functionality into a separate fun
sergei 2016/03/16 14:44:23 Done.
234
235 let pageInfo = yield pageInfoFuture;
236
237 result.finalUrl = tab.linkedBrowser.currentURI.spec; 330 result.finalUrl = tab.linkedBrowser.currentURI.spec;
238 Object.assign(result, pageInfo);
239 result.endTime = Date.now(); 331 result.endTime = Date.now();
240 } 332 }
241 finally 333 finally
242 { 334 {
335 if (requestNotifier)
336 requestNotifier.shutdown();
243 tabAllocator.releaseTab(tab); 337 tabAllocator.releaseTab(tab);
244 } 338 }
245 return result; 339 return result;
246 } 340 }
247 341
248 function reportException(e) 342 function reportException(e)
249 { 343 {
250 let stack = ""; 344 let stack = "";
251 if (e && typeof e == "object" && "stack" in e) 345 if (e && typeof e == "object" && "stack" in e)
252 stack = e.stack + "\n"; 346 stack = e.stack + "\n";
253 347
254 Cu.reportError(e); 348 Cu.reportError(e);
255 dump(e + "\n" + stack + "\n"); 349 dump(e + "\n" + stack + "\n");
256 } 350 }
257
258 let {addonRoot} = require("info");
259 let frameScriptPath = addonRoot + "/lib/child/frameScript.js";
260 let globalMessageManager = Services.mm;
261 globalMessageManager.loadFrameScript(frameScriptPath, true);
Wladimir Palant 2016/03/15 10:07:10 This should be a process script, no point using a
262
263 let onReportException = function(msg)
264 {
265 reportException(msg.objects);
Wladimir Palant 2016/03/15 10:07:10 Please don't use msg.objects - ever. That's a wrap
sergei 2016/03/16 14:44:23 Acknowledged. Actually, I wanted to avoid duplicat
sergei 2016/03/16 14:44:23 I know, I used `msg.objects` because we don't know
Wladimir Palant 2016/09/14 16:11:46 Worst-case scenario: deadlocks because all of that
266 }
267 globalMessageManager.addMessageListener("abpcrawler:reportException", onReportException);
268
269 onShutdown.add(() =>
270 {
271 globalMessageManager.removeMessageListener("abpcrawler:reportException", onReportException);
272 globalMessageManager.removeDelayedFrameScript(frameScriptPath);
273 });
LEFTRIGHT

Powered by Google App Engine
This is Rietveld