#!/usr/bin/env python
# coding: utf-8

import argparse
import datetime
import errno
import hashlib
import io
import json
import os
import random
import subprocess
import sys
import tempfile
import threading
import urllib
import urlparse
from wsgiref.simple_server import make_server

from mozprofile import FirefoxProfile
from mozrunner import FirefoxRunner


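# CrawlerApp is the WSGI application the crawler extension talks to on
# localhost: GET /parameters hands out the URL list, the timeout and the tab
# limit, and POST /save stores the results reported for each URL.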
class CrawlerApp:
    server = None

    def __init__(self, parameters):
        self.parameters = parameters
        with io.open(self.parameters.list, 'r', encoding='utf-8') as handle:
            self.urls = map(unicode.strip, handle.readlines())

    def __call__(self, environ, start_response):
        path = environ.get('PATH_INFO', '')
        if path == '/parameters':
            start_response('200 OK', [('Content-Type', 'application/json')])
            return [json.dumps({
                'urls': self.urls,
                'timeout': self.parameters.timeout * 1000,
                'maxtabs': self.parameters.maxtabs,
            })]
        elif path == '/save':
            try:
                request_body_size = int(environ.get('CONTENT_LENGTH', 0))
            except ValueError:
                start_response('400 Bad Request', [])
                return ''

            data = json.loads(environ['wsgi.input'].read(request_body_size))
            self.urls.remove(data['url'])

            fullurl = data['url']
            if not urlparse.urlparse(fullurl).scheme:
                fullurl = 'http://' + fullurl
            parsedurl = urlparse.urlparse(fullurl)
            urlhash = hashlib.new('md5', data['url']).hexdigest()
            timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f')
            basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash)
            datapath = os.path.join(self.parameters.outdir, basename + ".json")
            screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg")
            sourcepath = os.path.join(self.parameters.outdir, basename + ".xml")

            try:
                os.makedirs(self.parameters.outdir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            if "screenshot" in data:
                with open(screenshotpath, 'wb') as handle:
                    handle.write(urllib.urlopen(data["screenshot"]).read())
                del data["screenshot"]

            if "source" in data:
                with io.open(sourcepath, 'w', encoding='utf-8') as handle:
                    handle.write(data["source"])
                del data["source"]

            with io.open(datapath, 'w', encoding='utf-8') as handle:
                handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n')
            start_response('204 No Content', [])
            return ''

        start_response('404 Not Found', [])
        return ''


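# run() builds the crawler extension, prepares a Firefox profile with Adblock
# Plus and the requested filter lists, starts the local CrawlerApp server and
# keeps relaunching Firefox until every URL from the list has been reported
# back via /save.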
def run():
    parser = argparse.ArgumentParser(description='Run crawler')
    parser.add_argument(
        '-b', '--binary', type=str,
        help='path to the Firefox binary'
    )
    parser.add_argument(
        '-a', '--abpdir', type=str,
        help='path to the Adblock Plus repository'
    )
    parser.add_argument(
        '-f', '--filters', metavar='url', type=str, nargs='+',
        default=["https://easylist-downloads.adblockplus.org/easylist.txt",
                 "https://easylist-downloads.adblockplus.org/exceptionrules.txt"],
        help='filter lists to install in Adblock Plus. An argument can also '
             'have the format path=url, in which case the data is read from '
             'the given path instead.'
    )
    parser.add_argument(
        '-t', '--timeout', type=int, default=300,
        help='load timeout (seconds)'
    )
    parser.add_argument(
        '-x', '--maxtabs', type=int, default=15,
        help='maximum number of tabs to open in parallel'
    )
    parser.add_argument(
        'list', type=str,
        help='URL list to process'
    )
    parser.add_argument(
        'outdir', type=str,
        help='directory to write data into'
    )
    parameters = parser.parse_args()

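    # Example invocation (script and file names are illustrative only):
    #   python run.py -b /usr/bin/firefox -t 120 -x 10 urls.txt output/
    # where urls.txt lists one URL per line and output/ receives the .json,
    # .jpg and .xml files written by CrawlerApp.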
    import buildtools.packagerGecko as packager
    cleanup = []
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
        handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi')
        os.close(handle)
        cleanup.append(crawlerxpi)
        packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True)

        abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi'
        if parameters.abpdir:
            handle, abpxpi = tempfile.mkstemp(suffix='.xpi')
            os.close(handle)
            cleanup.append(abpxpi)
            packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True)

        profile = FirefoxProfile(
            addons=[
                crawlerxpi,
                abpxpi,
            ],
            preferences={
                'browser.startup.homepage': 'about:blank',
                'browser.tabs.warnOnCloseOtherTabs': False,
                'browser.uitour.enabled': False,
                'prompts.tab_modal.enabled': False,
                'startup.homepage_welcome_url': 'about:blank',
                'startup.homepage_welcome_url.additional': 'about:blank',
                'xpinstall.signatures.required': False,
            }
        )

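        # Pre-populate the profile with a minimal patterns.ini so Adblock Plus
        # starts with the requested subscriptions instead of downloading them
        # on first run. The generated file looks roughly like this (sample,
        # abbreviated):
        #   # Adblock Plus preferences
        #   version=4
        #   [Subscription]
        #   url=https://easylist-downloads.adblockplus.org/easylist.txt
        #   [Subscription filters]
        #   ...filter lines from the list, with its header line stripped...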
        abpsettings = os.path.join(profile.profile, 'adblockplus')
        os.makedirs(abpsettings)
        with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle:
            print >>handle, '# Adblock Plus preferences'
            print >>handle, 'version=4'
            for url in parameters.filters:
                if '=' in url:
                    path, url = url.split('=', 1)
                    with open(path, 'r') as source:
                        data = source.read()
                else:
                    data = urllib.urlopen(url).read()
                print >>handle, '[Subscription]'
                print >>handle, 'url=%s' % url
                print >>handle, '[Subscription filters]'
                print >>handle, '\n'.join(data.splitlines()[1:])
    finally:
        for path in cleanup:
            os.unlink(path)

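    # Firefox is restarted for as long as unprocessed URLs remain, so the
    # crawl resumes with the remaining URLs if the browser exits early.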
    server = None
    try:
        port = random.randrange(2000, 60000)
        print "Communicating with client on port %i" % port

        app = CrawlerApp(parameters)
        server = make_server('localhost', port, app)
        app.server = server
        threading.Thread(target=lambda: server.serve_forever()).start()

        runner = FirefoxRunner(
            profile=profile,
            binary=parameters.binary,
            cmdargs=['--crawler-port', str(port)],
            env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'),
        )
        while app.urls:
            runner.start()
            runner.wait()
    finally:
        if server:
            server.shutdown()
        profile.cleanup()

if __name__ == '__main__':
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")

    try:
        subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
    except subprocess.CalledProcessError as e:
        print >>sys.stderr, e
        print >>sys.stderr, "Failed to ensure dependencies are up-to-date!"

    run()