OLD | NEW |
1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
2 # Copyright (C) 2006-2016 Eyeo GmbH | 2 # Copyright (C) 2006-2016 Eyeo GmbH |
3 # | 3 # |
4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
7 # | 7 # |
8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
(...skipping 29 matching lines...) |
40 log_regexp = None | 40 log_regexp = None |
41 gecko_apps = None | 41 gecko_apps = None |
42 | 42 |
43 | 43 |
44 class StatsFile: | 44 class StatsFile: |
45 def __init__(self, path): | 45 def __init__(self, path): |
46 self._inner_file = None | 46 self._inner_file = None |
47 self._processes = [] | 47 self._processes = [] |
48 | 48 |
49 parseresult = urlparse.urlparse(path) | 49 parseresult = urlparse.urlparse(path) |
50 if parseresult.scheme == "ssh" and parseresult.username and parseresult.hostname and parseresult.path: | 50 if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path: |
51 command = [ | 51 command = [ |
52 "ssh", "-q", "-o", "NumberOfPasswordPrompts 0", "-T", "-k", | 52 'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k', |
53 "-l", parseresult.username, | 53 '-l', parseresult.username, |
54 parseresult.hostname, | 54 parseresult.hostname, |
55 parseresult.path.lstrip("/") | 55 parseresult.path.lstrip('/') |
56 ] | 56 ] |
57 if parseresult.port: | 57 if parseresult.port: |
58 command[1:1] = ["-P", str(parseresult.port)] | 58 command[1:1] = ['-P', str(parseresult.port)] |
59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) | 59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) |
60 self._processes.append(ssh_process) | 60 self._processes.append(ssh_process) |
61 self._file = ssh_process.stdout | 61 self._file = ssh_process.stdout |
62 elif parseresult.scheme in ("http", "https"): | 62 elif parseresult.scheme in ('http', 'https'): |
63 self._file = urllib.urlopen(path) | 63 self._file = urllib.urlopen(path) |
64 elif os.path.exists(path): | 64 elif os.path.exists(path): |
65 self._file = open(path, "rb") | 65 self._file = open(path, 'rb') |
66 else: | 66 else: |
67 raise IOError("Path '%s' not recognized" % path) | 67 raise IOError("Path '%s' not recognized" % path) |
68 | 68 |
69 if path.endswith(".gz"): | 69 if path.endswith('.gz'): |
70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) | 70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) |
71 gzip_process = subprocess.Popen(["gzip", "-cd"], stdin=self._file, stdout=subprocess.PIPE) | 71 gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE) |
72 self._processes.append(gzip_process) | 72 self._processes.append(gzip_process) |
73 self._file, self._inner_file = gzip_process.stdout, self._file | 73 self._file, self._inner_file = gzip_process.stdout, self._file |
74 | 74 |
75 def __getattr__(self, name): | 75 def __getattr__(self, name): |
76 return getattr(self._file, name) | 76 return getattr(self._file, name) |
77 | 77 |
78 def close(self): | 78 def close(self): |
79 self._file.close() | 79 self._file.close() |
80 if self._inner_file: | 80 if self._inner_file: |
81 self._inner_file.close() | 81 self._inner_file.close() |
82 for process in self._processes: | 82 for process in self._processes: |
83 process.wait() | 83 process.wait() |
84 | 84 |
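
Note for context: StatsFile chains child processes so callers can treat a remote, compressed log as a plain file object. A minimal usage sketch (host and path invented):

    # Streams "ssh ... | gzip -cd" under the hood; lines arrive decompressed.
    stats = StatsFile('ssh://stats@mirror1.example.com/logs/access.log.1.gz')
    try:
        for line in stats:
            pass  # handle one raw log line
    finally:
        stats.close()  # closes both pipe ends, then waits for the children
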
85 | 85 |
86 def get_stats_files(): | 86 def get_stats_files(): |
87 config = get_config() | 87 config = get_config() |
88 | 88 |
89 prefix = "mirror_" | 89 prefix = 'mirror_' |
90 options = filter(lambda o: o.startswith(prefix), config.options("stats")) | 90 options = filter(lambda o: o.startswith(prefix), config.options('stats')) |
91 for option in options: | 91 for option in options: |
92 if config.has_option("stats", option): | 92 if config.has_option('stats', option): |
93 value = config.get("stats", option) | 93 value = config.get('stats', option) |
94 if " " in value: | 94 if ' ' in value: |
95 yield [option[len(prefix):]] + value.split(None, 1) | 95 yield [option[len(prefix):]] + value.split(None, 1) |
96 else: | 96 else: |
97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) | 97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) |
98 else: | 98 else: |
99 print >>sys.stderr, "Option '%s' not found in the configuration" % option | 99 print >>sys.stderr, "Option '%s' not found in the configuration" % option |
100 | 100 |
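
Note for context: each mirror_* option in the [stats] section is expected to hold a server type and a log location separated by whitespace. A hypothetical entry and the list it yields:

    # [stats]
    # mirror_mirror1 = download ssh://stats@mirror1.example.com/logs/access.log.1.gz
    #
    # get_stats_files() then yields:
    #   ['mirror1', 'download', 'ssh://stats@mirror1.example.com/logs/access.log.1.gz']
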
101 | 101 |
102 def cache_lru(func): | 102 def cache_lru(func): |
103 """ | 103 """ |
104 Decorator that memoizes the return values of a single-parameter function in | 104 Decorator that memoizes the return values of a single-parameter function in |
(...skipping 21 matching lines...) |
126 results[arg] = result | 126 results[arg] = result |
127 return result | 127 return result |
128 return wrapped | 128 return wrapped |
129 | 129 |
130 | 130 |
131 def cache_last(func): | 131 def cache_last(func): |
132 """ | 132 """ |
133 Decorator that memoizes the last return value of a function in case it is | 133 Decorator that memoizes the last return value of a function in case it is |
134 called again with the same parameters. | 134 called again with the same parameters. |
135 """ | 135 """ |
136 result = {"args": None, "result": None} | 136 result = {'args': None, 'result': None} |
137 | 137 |
138 def wrapped(*args): | 138 def wrapped(*args): |
139 if args != result["args"]: | 139 if args != result['args']: |
140 result["result"] = func(*args) | 140 result['result'] = func(*args) |
141 result["args"] = args | 141 result['args'] = args |
142 return result["result"] | 142 return result['result'] |
143 return wrapped | 143 return wrapped |
144 | 144 |
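
Note for context, a toy illustration of the cache_last semantics (function invented): only the most recent arguments/result pair is kept, which suits a sequential log scan where consecutive lines tend to repeat the same values:

    @cache_last
    def square(x):
        print 'computing %d' % x
        return x * x

    square(3)  # prints 'computing 3'
    square(3)  # cached, prints nothing
    square(4)  # prints 'computing 4', replaces the cached entry
    square(3)  # prints 'computing 3' again: only the last call is remembered
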
145 | 145 |
146 @cache_lru | 146 @cache_lru |
147 def parse_ua(ua): | 147 def parse_ua(ua): |
148 # Opera might disguise itself as another browser so it needs to go first | 148 # Opera might disguise itself as another browser so it needs to go first |
149 match = re.search(r"\bOpera/([\d\.]+)", ua) | 149 match = re.search(r'\bOpera/([\d\.]+)', ua) |
150 if match: | 150 if match: |
151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
152 match2 = re.search(r"\bVersion/([\d\.]+)", ua) | 152 match2 = re.search(r'\bVersion/([\d\.]+)', ua) |
153 if match2: | 153 if match2: |
154 return "Opera", match2.group(1) | 154 return 'Opera', match2.group(1) |
155 else: | 155 else: |
156 return "Opera", match.group(1) | 156 return 'Opera', match.group(1) |
157 | 157 |
158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
159 match = re.search(r"\bOPR/(\d+\.\d+)", ua) | 159 match = re.search(r'\bOPR/(\d+\.\d+)', ua) |
160 if match: | 160 if match: |
161 return "Opera", match.group(1) | 161 return 'Opera', match.group(1) |
162 | 162 |
163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well | 163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well |
164 match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua) | 164 match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua) |
165 if match: | 165 if match: |
166 if match.group(1) == "Fennec": | 166 if match.group(1) == 'Fennec': |
167 return "Firefox Mobile", match.group(2) | 167 return 'Firefox Mobile', match.group(2) |
168 else: | 168 else: |
169 return match.group(1), match.group(2) | 169 return match.group(1), match.group(2) |
170 | 170 |
171 match = re.search(r"\bFirefox/(\d+\.\d+)", ua) | 171 match = re.search(r'\bFirefox/(\d+\.\d+)', ua) |
172 if match: | 172 if match: |
173 if re.search(r"\bMobile;", ua): | 173 if re.search(r'\bMobile;', ua): |
174 return "Firefox Mobile", match.group(1) | 174 return 'Firefox Mobile', match.group(1) |
175 elif re.search(r"\bTablet;", ua): | 175 elif re.search(r'\bTablet;', ua): |
176 return "Firefox Tablet", match.group(1) | 176 return 'Firefox Tablet', match.group(1) |
177 else: | 177 else: |
178 return "Firefox", match.group(1) | 178 return 'Firefox', match.group(1) |
179 | 179 |
180 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | 180 match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua) |
181 if match and re.search(r"\bGecko/", ua): | 181 if match and re.search(r'\bGecko/', ua): |
182 if match.group(3) and int(match.group(1)) < 2: | 182 if match.group(3) and int(match.group(1)) < 2: |
183 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.
group(3)) | 183 return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.
group(3)) |
184 else: | 184 else: |
185 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) | 185 return 'Gecko', '%s.%s' % (match.group(1), match.group(2)) |
186 | 186 |
187 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) | 187 match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua) |
188 if match: | 188 if match: |
189 return "CoolNovo", match.group(1) | 189 return 'CoolNovo', match.group(1) |
190 | 190 |
191 match = re.search(r"\bEdge/(\d+)\.\d+", ua) | 191 match = re.search(r'\bEdge/(\d+)\.\d+', ua) |
192 if match: | 192 if match: |
193 return "Edge", match.group(1) | 193 return 'Edge', match.group(1) |
194 | 194 |
195 match = re.search(r"\bChrome/(\d+\.\d+)", ua) | 195 match = re.search(r'\bChrome/(\d+\.\d+)', ua) |
196 if match: | 196 if match: |
197 return "Chrome", match.group(1) | 197 return 'Chrome', match.group(1) |
198 | 198 |
199 match = re.search(r"\bVersion/(\d+\.\d+)", ua) | 199 match = re.search(r'\bVersion/(\d+\.\d+)', ua) |
200 if match and re.search(r"\bMobile Safari/", ua): | 200 if match and re.search(r'\bMobile Safari/', ua): |
201 return "Mobile Safari", match.group(1) | 201 return 'Mobile Safari', match.group(1) |
202 if match and re.search(r"\bSafari/", ua): | 202 if match and re.search(r'\bSafari/', ua): |
203 return "Safari", match.group(1) | 203 return 'Safari', match.group(1) |
204 | 204 |
205 if re.search(r"\bAppleWebKit/", ua): | 205 if re.search(r'\bAppleWebKit/', ua): |
206 return "WebKit", "" | 206 return 'WebKit', '' |
207 | 207 |
208 match = re.search(r"\bMSIE (\d+\.\d+)", ua) | 208 match = re.search(r'\bMSIE (\d+\.\d+)', ua) |
209 if match: | 209 if match: |
210 return "MSIE", match.group(1) | 210 return 'MSIE', match.group(1) |
211 | 211 |
212 match = re.search(r"\bTrident/(\d+\.\d+)", ua) | 212 match = re.search(r'\bTrident/(\d+\.\d+)', ua) |
213 if match: | 213 if match: |
214 match2 = re.search(r"\brv:(\d+\.\d+)", ua) | 214 match2 = re.search(r'\brv:(\d+\.\d+)', ua) |
215 if match2: | 215 if match2: |
216 return "MSIE", match2.group(1) | 216 return 'MSIE', match2.group(1) |
217 else: | 217 else: |
218 return "Trident", match.group(1) | 218 return 'Trident', match.group(1) |
219 | 219 |
220 match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua) | 220 match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua) |
221 if match: | 221 if match: |
222 return "Android", match.group(1) or "" | 222 return 'Android', match.group(1) or '' |
223 | 223 |
224 match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua) | 224 match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua) |
225 if match: | 225 if match: |
226 return "Android", match.group(1) | 226 return 'Android', match.group(1) |
227 | 227 |
228 # ABP/Android downloads use that user agent | 228 # ABP/Android downloads use that user agent |
229 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | 229 if ua.startswith('Apache-HttpClient/UNAVAILABLE'): |
230 return "Android", "" | 230 return 'Android', '' |
231 | 231 |
232 # ABP/IE downloads use that user agent | 232 # ABP/IE downloads use that user agent |
233 if ua == "Adblock Plus": | 233 if ua == 'Adblock Plus': |
234 return "ABP", "" | 234 return 'ABP', '' |
235 | 235 |
236 return "Other", "" | 236 return 'Other', '' |
237 | 237 |
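
Note for context, a few illustrative (abbreviated) user-agent strings and the results the matching order above produces:

    parse_ua('Opera/9.80 (Windows NT 6.1) Presto/2.12.388 Version/12.16')
    # -> ('Opera', '12.16')   Version/ overrides the frozen Opera/9.80 token
    parse_ua('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/45.0.2454.85 Safari/537.36')
    # -> ('Chrome', '45.0')   matched before the bare Safari/WebKit fallbacks
    parse_ua('curl/7.43.0')
    # -> ('Other', '')
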
238 | 238 |
239 def process_ip(ip, geo, geov6): | 239 def process_ip(ip, geo, geov6): |
240 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | 240 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) |
241 if match: | 241 if match: |
242 ip = match.group(1) | 242 ip = match.group(1) |
243 | 243 |
244 try: | 244 try: |
245 if ":" in ip: | 245 if ':' in ip: |
246 country = geov6.country_code_by_addr(ip) | 246 country = geov6.country_code_by_addr(ip) |
247 else: | 247 else: |
248 country = geo.country_code_by_addr(ip) | 248 country = geo.country_code_by_addr(ip) |
249 except: | 249 except: |
250 traceback.print_exc() | 250 traceback.print_exc() |
251 country = "" | 251 country = '' |
252 | 252 |
253 if country in (None, "", "--"): | 253 if country in (None, '', '--'): |
254 country = "unknown" | 254 country = 'unknown' |
255 country = country.lower() | 255 country = country.lower() |
256 | 256 |
257 return ip, country | 257 return ip, country |
258 | 258 |
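
Note for context: the leading regexp unwraps IPv4-mapped IPv6 addresses so they hit the IPv4 GeoIP database. Illustrative behaviour (address and country code invented):

    # process_ip('::ffff:198.51.100.7', geo, geov6) -> ('198.51.100.7', 'de')
    # Lookup failures and empty/'--' results normalise to 'unknown'.
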
259 | 259 |
260 @cache_last | 260 @cache_last |
261 def parse_time(timestr, tz_hours, tz_minutes): | 261 def parse_time(timestr, tz_hours, tz_minutes): |
262 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | 262 result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S') |
263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) | 263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) |
264 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | 264 return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour |
265 | 265 |
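
Worked example of the timezone normalisation: for a log stamped 01/Jan/2016:12:30:00 +0530, tz_hours is 5 and tz_minutes is 30, so copysign keeps the minutes positive and the stamp shifts back to UTC:

    # 12:30 - 5h30m = 07:00 UTC; 2016-01-01 was a Friday (weekday 4), so:
    # parse_time('01/Jan/2016:12:30:00', 5, 30)
    # -> (datetime(2016, 1, 1, 7, 0), '201601', 1, 4, 7)

For a negative offset such as -0330, copysign(30, -3) flips the minutes to -30 so both components shift in the same direction.
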
266 | 266 |
267 @cache_lru | 267 @cache_lru |
268 def parse_path(path): | 268 def parse_path(path): |
269 urlparts = urlparse.urlparse(path) | 269 urlparts = urlparse.urlparse(path) |
270 try: | 270 try: |
271 path = urllib.unquote(urlparts.path).decode("utf-8") | 271 path = urllib.unquote(urlparts.path).decode('utf-8') |
272 except: | 272 except: |
273 path = urlparts.path | 273 path = urlparts.path |
274 return path[1:], urlparts.query | 274 return path[1:], urlparts.query |
275 | 275 |
276 | 276 |
277 @cache_lru | 277 @cache_lru |
278 def parse_query(query): | 278 def parse_query(query): |
279 return urlparse.parse_qs(query) | 279 return urlparse.parse_qs(query) |
280 | 280 |
281 | 281 |
282 @cache_lru | 282 @cache_lru |
283 def parse_lastversion(last_version): | 283 def parse_lastversion(last_version): |
284 if '-' in last_version: | 284 if '-' in last_version: |
285 last_version = last_version.split('-', 1)[0] | 285 last_version = last_version.split('-', 1)[0] |
286 return datetime.strptime(last_version, "%Y%m%d%H%M") | 286 return datetime.strptime(last_version, '%Y%m%d%H%M') |
287 | 287 |
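
Note for context: lastVersion values are timestamps, optionally carrying a suffix after a dash:

    # parse_lastversion('201512312330')      -> datetime(2015, 12, 31, 23, 30)
    # parse_lastversion('201512312330-beta') -> same; the suffix is dropped first
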
288 | 288 |
289 @cache_lru | 289 @cache_lru |
290 def get_week(date): | 290 def get_week(date): |
291 return date.isocalendar()[0:2] | 291 return date.isocalendar()[0:2] |
292 | 292 |
293 | 293 |
294 def parse_downloader_query(info): | 294 def parse_downloader_query(info): |
295 params = parse_query(info["query"]) | 295 params = parse_query(info['query']) |
296 for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"): | 296 for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'): |
297 info[param] = params.get(param, ["unknown"])[0] | 297 info[param] = params.get(param, ['unknown'])[0] |
298 | 298 |
299 # Only leave the major and minor release number for application and platform | 299 # Only leave the major and minor release number for application and platform |
300 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicati
onVersion"]) | 300 info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicati
onVersion']) |
301 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVers
ion"]) | 301 info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVers
ion']) |
302 | 302 |
303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | 303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters |
304 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | 304 match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None |
305 if match: | 305 if match: |
306 info["addonName"] = "chromeadblock" | 306 info['addonName'] = 'chromeadblock' |
307 info["addonVersion"] = match.group(1) | 307 info['addonVersion'] = match.group(1) |
308 | 308 |
309 last_version = params.get("lastVersion", ["unknown"])[0] | 309 last_version = params.get('lastVersion', ['unknown'])[0] |
310 if info["file"] == "notification.json" and last_version == "0" and ( | 310 if info['file'] == 'notification.json' and last_version == '0' and ( |
311 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1")
or | 311 (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1')
or |
312 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info
["addonVersion"] == "1.5.2") | 312 (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info
['addonVersion'] == '1.5.2') |
313 ): | 313 ): |
314 # Broken notification version number in these releases, treat like unknown | 314 # Broken notification version number in these releases, treat like unknown |
315 last_version = "unknown" | 315 last_version = 'unknown' |
316 | 316 |
317 if last_version == "unknown": | 317 if last_version == 'unknown': |
318 info["downloadInterval"] = "unknown" | 318 info['downloadInterval'] = 'unknown' |
319 info["previousDownload"] = "unknown" | 319 info['previousDownload'] = 'unknown' |
320 elif last_version == "0": | 320 elif last_version == '0': |
321 info["downloadInterval"] = "unknown" | 321 info['downloadInterval'] = 'unknown' |
322 info["previousDownload"] = "unknown" | 322 info['previousDownload'] = 'unknown' |
323 info["firstDownload"] = True | 323 info['firstDownload'] = True |
324 else: | 324 else: |
325 try: | 325 try: |
326 last_update = parse_lastversion(last_version) | 326 last_update = parse_lastversion(last_version) |
327 diff = info["time"] - last_update | 327 diff = info['time'] - last_update |
328 if diff.days >= 365: | 328 if diff.days >= 365: |
329 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | 329 info['downloadInterval'] = '%i year(s)' % (diff.days / 365) |
330 elif diff.days >= 30: | 330 elif diff.days >= 30: |
331 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | 331 info['downloadInterval'] = '%i month(s)' % (diff.days / 30) |
332 elif diff.days >= 1: | 332 elif diff.days >= 1: |
333 info["downloadInterval"] = "%i day(s)" % diff.days | 333 info['downloadInterval'] = '%i day(s)' % diff.days |
334 else: | 334 else: |
335 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | 335 info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600) |
336 | 336 |
337 if info["addonName"].startswith("adblockplus"): | 337 if info['addonName'].startswith('adblockplus'): |
338 diffdays = (info["time"].date() - last_update.date()).days | 338 diffdays = (info['time'].date() - last_update.date()).days |
339 if diffdays == 0: | 339 if diffdays == 0: |
340 info["previousDownload"] = "same day" | 340 info['previousDownload'] = 'same day' |
341 elif diffdays < 30: | 341 elif diffdays < 30: |
342 info["previousDownload"] = "%i day(s)" % diffdays | 342 info['previousDownload'] = '%i day(s)' % diffdays |
343 elif diffdays < 365: | 343 elif diffdays < 365: |
344 info["previousDownload"] = "%i month(s)" % (diffdays / 30) | 344 info['previousDownload'] = '%i month(s)' % (diffdays / 30) |
345 else: | 345 else: |
346 info["previousDownload"] = "%i year(s)" % (diffdays / 365) | 346 info['previousDownload'] = '%i year(s)' % (diffdays / 365) |
347 else: | 347 else: |
348 info["previousDownload"] = "unknown" | 348 info['previousDownload'] = 'unknown' |
349 | 349 |
350 if last_update.year != info["time"].year or last_update.month != info["time"].month: | 350 if last_update.year != info['time'].year or last_update.month != info['time'].month: |
351 info["firstInMonth"] = info["firstInDay"] = True | 351 info['firstInMonth'] = info['firstInDay'] = True |
352 elif last_update.day != info["time"].day: | 352 elif last_update.day != info['time'].day: |
353 info["firstInDay"] = True | 353 info['firstInDay'] = True |
354 | 354 |
355 if get_week(last_update) != get_week(info["time"]): | 355 if get_week(last_update) != get_week(info['time']): |
356 info["firstInWeek"] = True | 356 info['firstInWeek'] = True |
357 except ValueError: | 357 except ValueError: |
358 info["downloadInterval"] = "unknown" | 358 info['downloadInterval'] = 'unknown' |
359 info["previousDownload"] = "unknown" | 359 info['previousDownload'] = 'unknown' |
360 pass | 360 pass |
361 | 361 |
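
Note for context, a hypothetical downloader request and the fields derived from it (all values invented):

    # GET /exceptionrules.txt?addonName=adblockplus&addonVersion=2.6
    #     &application=firefox&applicationVersion=38.0.5&lastVersion=201601010900
    # with info['time'] two days later yields, among others:
    #   applicationVersion = '38.0' (major.minor only)
    #   downloadInterval   = '2 day(s)'
    #   previousDownload   = '2 day(s)' (addon name starts with 'adblockplus')
    #   firstInDay = True, and firstInWeek/firstInMonth depending on the calendar
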
362 | 362 |
363 def parse_addon_name(file): | 363 def parse_addon_name(file): |
364 if "/" in file: | 364 if '/' in file: |
365 return file.split("/")[-2] | 365 return file.split('/')[-2] |
366 else: | 366 else: |
367 return None | 367 return None |
368 | 368 |
369 | 369 |
370 def parse_gecko_query(query): | 370 def parse_gecko_query(query): |
371 params = urlparse.parse_qs(query) | 371 params = urlparse.parse_qs(query) |
372 | 372 |
373 version = params.get("version", ["unknown"])[0] | 373 version = params.get('version', ['unknown'])[0] |
374 | 374 |
375 global gecko_apps | 375 global gecko_apps |
376 if gecko_apps == None: | 376 if gecko_apps == None: |
377 from buildtools.packagerGecko import KNOWN_APPS | 377 from buildtools.packagerGecko import KNOWN_APPS |
378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} | 378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} |
379 appID = params.get("appID", ["unknown"])[0] | 379 appID = params.get('appID', ['unknown'])[0] |
380 | 380 |
381 application = gecko_apps.get(appID, "unknown") | 381 application = gecko_apps.get(appID, 'unknown') |
382 applicationVersion = params.get("appVersion", ["unknown"])[0] | 382 applicationVersion = params.get('appVersion', ['unknown'])[0] |
383 | 383 |
384 # Only leave the major and minor release number for application | 384 # Only leave the major and minor release number for application |
385 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 385 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
386 | 386 |
387 return version, application, applicationVersion | 387 return version, application, applicationVersion |
388 | 388 |
389 | 389 |
390 def parse_chrome_query(query): | 390 def parse_chrome_query(query): |
391 params = urlparse.parse_qs(query) | 391 params = urlparse.parse_qs(query) |
392 | 392 |
393 if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"): | 393 if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'): |
394 application = "chrome" | 394 application = 'chrome' |
395 else: | 395 else: |
396 application = "unknown" | 396 application = 'unknown' |
397 applicationVersion = params.get("prodversion", ["unknown"])[0] | 397 applicationVersion = params.get('prodversion', ['unknown'])[0] |
398 | 398 |
399 params2 = urlparse.parse_qs(params.get("x", [""])[0]) | 399 params2 = urlparse.parse_qs(params.get('x', [''])[0]) |
400 version = params2.get("v", ["unknown"])[0] | 400 version = params2.get('v', ['unknown'])[0] |
401 | 401 |
402 # Only leave the major and minor release number for application | 402 # Only leave the major and minor release number for application |
403 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 403 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
404 | 404 |
405 return version, application, applicationVersion | 405 return version, application, applicationVersion |
406 | 406 |
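
Note for context: Chrome update checks nest the extension's own data inside an x parameter, which is itself a URL-encoded query string. A hypothetical request:

    # updates.xml?x=id%3Dabcdefg%26v%3D1.9.4&prod=chromecrx&prodversion=45.0.2454.85
    # parse_chrome_query -> ('1.9.4', 'chrome', '45.0')
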
407 | 407 |
408 def parse_update_flag(query): | 408 def parse_update_flag(query): |
409 return "update" if query == "update" else "install" | 409 return 'update' if query == 'update' else 'install' |
410 | 410 |
411 | 411 |
412 def parse_record(line, ignored, geo, geov6): | 412 def parse_record(line, ignored, geo, geov6): |
413 global log_regexp | 413 global log_regexp |
414 if log_regexp == None: | 414 if log_regexp == None: |
415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') | 415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') |
416 | 416 |
417 match = re.search(log_regexp, line) | 417 match = re.search(log_regexp, line) |
418 if not match: | 418 if not match: |
419 return None | 419 return None |
420 | 420 |
421 status = int(match.group(6)) | 421 status = int(match.group(6)) |
422 if status not in (200, 301, 302): | 422 if status not in (200, 301, 302): |
423 return None | 423 return None |
424 | 424 |
425 info = { | 425 info = { |
426 "status": status, | 426 'status': status, |
427 "size": int(match.group(7)), | 427 'size': int(match.group(7)), |
428 } | 428 } |
429 | 429 |
430 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) | 430 info['ip'], info['country'] = process_ip(match.group(1), geo, geov6) |
431 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) | 431 info['time'], info['month'], info['day'], info['weekday'], info['hour'] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) |
432 info["file"], info["query"] = parse_path(match.group(5)) | 432 info['file'], info['query'] = parse_path(match.group(5)) |
433 info["referrer"] = match.group(8) | 433 info['referrer'] = match.group(8) |
434 info["ua"], info["uaversion"] = parse_ua(match.group(9)) | 434 info['ua'], info['uaversion'] = parse_ua(match.group(9)) |
435 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | 435 info['fullua'] = '%s %s' % (info['ua'], info['uaversion']) |
436 info["clientid"] = match.group(10) | 436 info['clientid'] = match.group(10) |
437 | 437 |
438 # Additional metadata depends on file type | 438 # Additional metadata depends on file type |
439 filename = os.path.basename(info["file"]) | 439 filename = os.path.basename(info['file']) |
440 ext = os.path.splitext(filename)[1] | 440 ext = os.path.splitext(filename)[1] |
441 if ext == ".txt" or filename == "update.json" or filename == "notification.j
son": | 441 if ext == '.txt' or filename == 'update.json' or filename == 'notification.j
son': |
442 # Subscription downloads, libadblockplus update checks and notification | 442 # Subscription downloads, libadblockplus update checks and notification |
443 # checks are performed by the downloader | 443 # checks are performed by the downloader |
444 parse_downloader_query(info) | 444 parse_downloader_query(info) |
445 elif ext == ".tpl": | 445 elif ext == '.tpl': |
446 # MSIE TPL download, no additional data here | 446 # MSIE TPL download, no additional data here |
447 pass | 447 pass |
448 elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe", ".safariextz"): | 448 elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'): |
449 # Package download, might be an update | 449 # Package download, might be an update |
450 info["installType"] = parse_update_flag(info["query"]) | 450 info['installType'] = parse_update_flag(info['query']) |
451 elif filename == "update.rdf": | 451 elif filename == 'update.rdf': |
452 # Gecko update check or a legacy Android update check. The latter doesn't | 452 # Gecko update check or a legacy Android update check. The latter doesn't |
453 # have usable data anyway so trying the Chrome route won't do any harm. | 453 # have usable data anyway so trying the Chrome route won't do any harm. |
454 info["addonName"] = parse_addon_name(info["file"]) | 454 info['addonName'] = parse_addon_name(info['file']) |
455 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_gecko_query(info["query"]) | 455 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_gecko_query(info['query']) |
456 elif filename == "updates.xml": | 456 elif filename == 'updates.xml': |
457 # Chrome update check | 457 # Chrome update check |
458 info["addonName"] = parse_addon_name(info["file"]) | 458 info['addonName'] = parse_addon_name(info['file']) |
459 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_chrome_query(info["query"]) | 459 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_chrome_query(info['query']) |
460 elif filename == "updates.plist": | 460 elif filename == 'updates.plist': |
461 # Safari update check, no additional data | 461 # Safari update check, no additional data |
462 pass | 462 pass |
463 else: | 463 else: |
464 ignored.add(info["file"]) | 464 ignored.add(info['file']) |
465 return None | 465 return None |
466 | 466 |
467 if "addonName" in info: | 467 if 'addonName' in info: |
468 info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"]) | 468 info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion']) |
469 if "application" in info: | 469 if 'application' in info: |
470 info["fullApplication"] = "%s %s" % (info["application"], info["applicat
ionVersion"]) | 470 info['fullApplication'] = '%s %s' % (info['application'], info['applicat
ionVersion']) |
471 if "platform" in info: | 471 if 'platform' in info: |
472 info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersio
n"]) | 472 info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersio
n']) |
473 return info | 473 return info |
474 | 474 |
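
Note for context, the kind of access-log line the regexp above expects, in the extended combined format (all field values invented; shown wrapped here, but it is a single log line):

    # 203.0.113.7 - - [01/Jan/2016:12:30:00 +0530] "GET /easylist.txt?lastVersion=0 HTTP/1.1"
    #     200 52346 "-" "Mozilla/5.0 ... Firefox/43.0"
    # -> info['file'] = 'easylist.txt', info['month'] = '201601',
    #    info['ua'] = 'Firefox', status 200, plus the downloader fields
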
475 | 475 |
476 def add_record(info, section, ignore_fields=()): | 476 def add_record(info, section, ignore_fields=()): |
477 section["hits"] = section.get("hits", 0) + 1 | 477 section['hits'] = section.get('hits', 0) + 1 |
478 section["bandwidth"] = section.get("bandwidth", 0) + info["size"] | 478 section['bandwidth'] = section.get('bandwidth', 0) + info['size'] |
479 | 479 |
480 if len(ignore_fields) < 2: | 480 if len(ignore_fields) < 2: |
481 for field in map(lambda f: f["name"], common.fields): | 481 for field in map(lambda f: f['name'], common.fields): |
482 if field in ignore_fields or field not in info: | 482 if field in ignore_fields or field not in info: |
483 continue | 483 continue |
484 | 484 |
485 value = info[field] | 485 value = info[field] |
486 if field not in section: | 486 if field not in section: |
487 section[field] = {} | 487 section[field] = {} |
488 if value not in section[field]: | 488 if value not in section[field]: |
489 section[field][value] = {} | 489 section[field][value] = {} |
490 | 490 |
491 add_record(info, section[field][value], ignore_fields + (field,)) | 491 add_record(info, section[field][value], ignore_fields + (field,)) |
492 | 492 |
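
Note for context: the recursion nests at most two of the common.fields dimensions per branch (ignore_fields grows by one per level), and every level keeps its own totals. Abridged shape of a section after one record (field names illustrative):

    # {'hits': 1, 'bandwidth': 52346,
    #  'country': {'in': {'hits': 1, 'bandwidth': 52346,
    #                     'ua': {'Firefox': {'hits': 1, 'bandwidth': 52346}},
    #                     ...}},
    #  'ua': {...}, ...}
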
493 | 493 |
494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): | 494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): |
495 data = {} | 495 data = {} |
496 for line in fileobj: | 496 for line in fileobj: |
497 info = parse_record(line, ignored, geo, geov6) | 497 info = parse_record(line, ignored, geo, geov6) |
498 if info == None: | 498 if info == None: |
499 continue | 499 continue |
500 | 500 |
501 info["mirror"] = mirror_name | 501 info['mirror'] = mirror_name |
502 if info["month"] not in data: | 502 if info['month'] not in data: |
503 data[info["month"]] = {} | 503 data[info['month']] = {} |
504 section = data[info["month"]] | 504 section = data[info['month']] |
505 | 505 |
506 if info["file"] not in section: | 506 if info['file'] not in section: |
507 section[info["file"]] = {} | 507 section[info['file']] = {} |
508 section = section[info["file"]] | 508 section = section[info['file']] |
509 | 509 |
510 add_record(info, section) | 510 add_record(info, section) |
511 return data | 511 return data |
512 | 512 |
513 | 513 |
514 def merge_objects(object1, object2, factor=1): | 514 def merge_objects(object1, object2, factor=1): |
515 for key, value in object2.iteritems(): | 515 for key, value in object2.iteritems(): |
516 try: | 516 try: |
517 key = unicode(key) | 517 key = unicode(key) |
518 except UnicodeDecodeError: | 518 except UnicodeDecodeError: |
519 key = unicode(key, encoding="latin-1") | 519 key = unicode(key, encoding='latin-1') |
520 if isinstance(value, numbers.Number): | 520 if isinstance(value, numbers.Number): |
521 object1[key] = object1.get(key, 0) + factor * value | 521 object1[key] = object1.get(key, 0) + factor * value |
522 else: | 522 else: |
523 merge_objects(object1.setdefault(key, {}), value, factor) | 523 merge_objects(object1.setdefault(key, {}), value, factor) |
524 | 524 |
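
Note for context: numeric leaves add factor * value while nested objects merge recursively, and factor=-1 is how --revert subtracts a previously imported log:

    stats = {'hits': 10, 'country': {'de': {'hits': 4}}}
    merge_objects(stats, {'hits': 2, 'country': {'de': {'hits': 1}}})
    # stats == {'hits': 12, 'country': {'de': {'hits': 5}}}
    merge_objects(stats, {'hits': 2, 'country': {'de': {'hits': 1}}}, factor=-1)
    # back to {'hits': 10, 'country': {'de': {'hits': 4}}}
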
525 | 525 |
526 def save_stats(server_type, data, factor=1): | 526 def save_stats(server_type, data, factor=1): |
527 base_dir = os.path.join(get_config().get("stats", "dataDirectory"), common.filename_encode(server_type)) | 527 base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type)) |
528 for month, month_data in data.iteritems(): | 528 for month, month_data in data.iteritems(): |
529 for name, file_data in month_data.iteritems(): | 529 for name, file_data in month_data.iteritems(): |
530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + ".json")) | 530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json')) |
531 if os.path.exists(path): | 531 if os.path.exists(path): |
532 with codecs.open(path, "rb", encoding="utf-8") as fileobj: | 532 with codecs.open(path, 'rb', encoding='utf-8') as fileobj: |
533 existing = json.load(fileobj) | 533 existing = json.load(fileobj) |
534 else: | 534 else: |
535 existing = {} | 535 existing = {} |
536 | 536 |
537 merge_objects(existing, file_data, factor) | 537 merge_objects(existing, file_data, factor) |
538 | 538 |
539 dir = os.path.dirname(path) | 539 dir = os.path.dirname(path) |
540 try: | 540 try: |
541 os.makedirs(dir) | 541 os.makedirs(dir) |
542 except OSError, e: | 542 except OSError, e: |
543 if e.errno != errno.EEXIST: | 543 if e.errno != errno.EEXIST: |
544 raise | 544 raise |
545 | 545 |
546 with codecs.open(path, "wb", encoding="utf-8") as fileobj: | 546 with codecs.open(path, 'wb', encoding='utf-8') as fileobj: |
547 json.dump(existing, fileobj, indent=2, sort_keys=True) | 547 json.dump(existing, fileobj, indent=2, sort_keys=True) |
548 | 548 |
549 | 549 |
550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): | 550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): |
551 try: | 551 try: |
552 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE) | 552 geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE) |
553 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE) | 553 geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE) |
554 | 554 |
555 ignored = set() | 555 ignored = set() |
556 fileobj = StatsFile(log_file) | 556 fileobj = StatsFile(log_file) |
557 try: | 557 try: |
558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) | 558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) |
559 finally: | 559 finally: |
560 fileobj.close() | 560 fileobj.close() |
561 | 561 |
562 lock.acquire() | 562 lock.acquire() |
563 try: | 563 try: |
564 save_stats(server_type, data, factor) | 564 save_stats(server_type, data, factor) |
565 finally: | 565 finally: |
566 lock.release() | 566 lock.release() |
567 return log_file, ignored | 567 return log_file, ignored |
568 except: | 568 except: |
569 print >>sys.stderr, "Unable to process log file '%s'" % log_file | 569 print >>sys.stderr, "Unable to process log file '%s'" % log_file |
570 traceback.print_exc() | 570 traceback.print_exc() |
571 return None, None | 571 return None, None |
572 | 572 |
573 | 573 |
574 def parse_sources(sources, factor=1, verbose=False): | 574 def parse_sources(sources, factor=1, verbose=False): |
575 pool = multiprocessing.Pool() | 575 pool = multiprocessing.Pool() |
576 lock = multiprocessing.Manager().Lock() | 576 lock = multiprocessing.Manager().Lock() |
577 callback = functools.partial(parse_source, factor, lock) | 577 callback = functools.partial(parse_source, factor, lock) |
578 try: | 578 try: |
579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): | 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): |
580 if verbose and ignored: | 580 if verbose and ignored: |
581 print "Ignored files for %s" % log_file | 581 print 'Ignored files for %s' % log_file |
582 print "=========================================================
===" | 582 print '=========================================================
===' |
583 print "\n".join(sorted(ignored)) | 583 print '\n'.join(sorted(ignored)) |
584 finally: | 584 finally: |
585 pool.close() | 585 pool.close() |
586 | 586 |
587 if __name__ == "__main__": | 587 if __name__ == '__main__': |
588 setupStderr() | 588 setupStderr() |
589 | 589 |
590 parser = argparse.ArgumentParser(description="Processes log files and merges them into the stats database") | 590 parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database') |
591 parser.add_argument("--verbose", dest="verbose", action="store_const", const
=True, default=False, help="Verbose mode, ignored requests will be listed") | 591 parser.add_argument('--verbose', dest='verbose', action='store_const', const
=True, default=False, help='Verbose mode, ignored requests will be listed') |
592 parser.add_argument("--revert", dest="factor", action="store_const", const=-
1, default=1, help="Remove log data from the database") | 592 parser.add_argument('--revert', dest='factor', action='store_const', const=-
1, default=1, help='Remove log data from the database') |
593 parser.add_argument("mirror_name", nargs="?", help="Name of the mirror serve
r that the file belongs to") | 593 parser.add_argument('mirror_name', nargs='?', help='Name of the mirror serve
r that the file belongs to') |
594 parser.add_argument("server_type", nargs="?", help="Server type like downloa
d, update or subscription") | 594 parser.add_argument('server_type', nargs='?', help='Server type like downloa
d, update or subscription') |
595 parser.add_argument("log_file", nargs="?", help="Log file path, can be a loc
al file path, http:// or ssh:// URL") | 595 parser.add_argument('log_file', nargs='?', help='Log file path, can be a loc
al file path, http:// or ssh:// URL') |
596 args = parser.parse_args() | 596 args = parser.parse_args() |
597 | 597 |
598 if args.mirror_name and args.server_type and args.log_file: | 598 if args.mirror_name and args.server_type and args.log_file: |
599 sources = [(args.mirror_name, args.server_type, args.log_file)] | 599 sources = [(args.mirror_name, args.server_type, args.log_file)] |
600 else: | 600 else: |
601 sources = get_stats_files() | 601 sources = get_stats_files() |
602 parse_sources(sources, args.factor, args.verbose) | 602 parse_sources(sources, args.factor, args.verbose) |
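
Note for context, typical invocations (script name and paths invented; with no positional arguments, all mirror_* sources from the configuration are processed):

    # python logprocessor.py mirror1 download ssh://stats@mirror1.example.com/logs/access.log.1.gz
    # python logprocessor.py --revert mirror1 download ssh://stats@mirror1.example.com/logs/access.log.1.gz
    # python logprocessor.py --verbose
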