Index: sitescripts/crawler/bin/import_filters.py
===================================================================
new file mode 100644
--- /dev/null
+++ b/sitescripts/crawler/bin/import_filters.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+
+# This Source Code is subject to the terms of the Mozilla Public License
+# version 2.0 (the "License"). You can obtain a copy of the License at
+# http://mozilla.org/MPL/2.0/.
+
+import MySQLdb, os, re, sys
+from sitescripts.utils import cached, get_config
+
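+# The connection is cached via @cached so repeated calls reuse it.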
+@cached(600)
+def _get_db():
+  database = get_config().get("crawler", "database")
+  dbuser = get_config().get("crawler", "dbuser")
+  dbpasswd = get_config().get("crawler", "dbpassword")
+  if os.name == "nt":
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8", named_pipe=True)
+  else:
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8")
+
+def _get_cursor():
+  return _get_db().cursor(MySQLdb.cursors.DictCursor)
+
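+# Element hiding filters have the form "domain1,domain2##selector";
+# everything before "##" is the comma-separated domain list, e.g.
+# "example.com,example.net##div.ad" yields two domains.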
+def _parse_hide_filter(line):
+  match = re.search(r"^(.*?)##", line)
+  return match.group(1).split(",") if match else []
+
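+# Blocking filters carry their domains either in a "$domain=" option
+# (entries separated by "|") or as a "||domain^" / "||domain/" prefix,
+# e.g. "*/ads/*$domain=example.com|example.net" or "||example.com^".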
+def _parse_block_filter(line):
+  match = re.search(r"domain=([^,]+)", line)
+  if match:
+    return match.group(1).split("|")
+
+  match = re.search(r"^\|\|(.*?)[/\^]", line)
+  return [match.group(1)] if match else []
+
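+# "!" introduces a comment, strip everything from the first "!" onwards.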
+def _remove_comment(line):
+  exclamation_index = line.find("!")
+  if exclamation_index != -1:
+    return line[:exclamation_index]
+  return line
+
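+# Read the filter file at filter_path and return a dict mapping each
+# filter line to the list of domains extracted by parse_filter.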
+def _parse_filters(filter_path, parse_filter):
+  filters = {}
+
+  try:
+    with open(filter_path) as filter_file:
+      for line in filter_file:
+        line = _remove_comment(line.strip())
+
+        domains = parse_filter(line)
+        if domains:
+          filters[line] = domains
+
+  except IOError:
+    print >>sys.stderr, "Unable to read filters from '%s'" % filter_path
+
+  return filters
+
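+# The domain-specific filter files live in the "easylist" subdirectory
+# of the EasyList checkout; each one needs its own parsing function.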
+def _extract_filters(easylist_dir):
+  filter_files = {"easylist_specific_block.txt": _parse_block_filter,
+                  "easylist_specific_hide.txt": _parse_hide_filter}
+  filters = {}
+  for filter_file, parse_filter in filter_files.iteritems():
+    filter_path = os.path.join(easylist_dir, "easylist", filter_file)
+    filters.update(_parse_filters(filter_path, parse_filter))
+  return filters
+
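+# Write the filters into the crawler schema: crawler_filters stores the
+# filter text (hashed with SHA-1), crawler_domains the domain names, and
+# crawler_domain_filters the many-to-many relation between the two.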
+def _insert_filters(filters):
+  cursor = _get_cursor()
+  filter_insert = """
+INSERT INTO crawler_filters (filter, filter_hash) VALUES (%s, sha1(filter))"""
+  domain_select = "SELECT id FROM crawler_domains WHERE domain = %s"
+  domain_insert = "INSERT INTO crawler_domains (domain) VALUES (%s)"
+  domain_filter_insert = """
+INSERT INTO crawler_domain_filters (filter, domain) VALUES (%s, %s)"""
+
+  for filter_line, domains in filters.iteritems():
+    cursor.execute(filter_insert, (filter_line,))
+    filter_id = cursor.lastrowid
+
+    for domain in domains:
+      cursor.execute(domain_select, (domain,))
+      result = cursor.fetchone()
+      if result:
+        domain_id = result["id"]
+      else:
+        cursor.execute(domain_insert, (domain,))
+        domain_id = cursor.lastrowid
+
+      cursor.execute(domain_filter_insert, (filter_id, domain_id))
+
+  _get_db().commit()
+
+if __name__ == "__main__":
+  easylist_dir = get_config().get("crawler", "easylist_repository")
+  filters = _extract_filters(easylist_dir)
+  _insert_filters(filters)