OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # coding: utf-8 |
| 3 |
| 4 # This Source Code is subject to the terms of the Mozilla Public License |
| 5 # version 2.0 (the "License"). You can obtain a copy of the License at |
| 6 # http://mozilla.org/MPL/2.0/. |
| 7 |
"""
Update the dictionaries in the rules
====================================

This script generates the dictionaries in the defaults/rules.json file based
on various sources like the list of public suffixes (http://publicsuffix.org/).
"""
| 15 |
import codecs
import itertools
import json
import os
import re
import sys
import time
import urllib
| 23 |
# URL schemes to put into the "scheme" dictionary. updateSchemes() hands this
# mapping to getSuffixes(), which treats each value as the priority of its key.
# NOTE(review): presumably a larger number means a more preferred scheme -
# confirm against the code that consumes the generated rules.
schemes = {
    'http:': 4,
    'https:': 3,
    'ftp:': 2,
    'irc:': 1,
}
| 30 |
# Stored unchanged under the "domainReferrals" key of the rules by
# updateRules().
# NOTE(review): the values look like URL query parameters for the given
# domain; how they are applied is decided by the consumer of the rules file.
# NOTE(review): the amazon.es tag is one character shorter than the other
# Amazon tags - confirm it is not truncated.
domainReferrals = {
    'amazon.co.uk': 'tag=uf07d-21',
    'amazon.com': 'tag=uf024-20',
    'amazon.de': 'tag=uf0e6-21',
    'amazon.fr': 'tag=uf02b-21',
    'amazon.es': 'tag=uf07-21',
    'amazon.it': 'tag=uf08d-21',
    'ozon.ru': 'partner=urlfixer',
}
| 40 |
# Domains that updateDomains() processes after the ones read from standard
# input. Entries that were already seen are skipped there, so the duplicates
# in this list (several Wikipedia hosts appear more than once) are harmless.
additionalDomains = [
    'fab.com',
    'ku.dk',
    'google.cz',
    'komplett.ie',
    'lotto.ie',
    'bt.yahoo.com',
    'o.co',
    'bet.hu',
    'haz.de',
    'sas.com',
    'nic.ir',
    'tomtop.com',
    'uwa.edu.au',
    'spacex.com',
    'eif.org',
    'geld.de',
    # From http://www.wikipedia.org/
    'en.wikipedia.org', 'ja.wikipedia.org', 'de.wikipedia.org', 'es.wikipedia.org',
    'ru.wikipedia.org', 'fr.wikipedia.org', 'it.wikipedia.org', 'pl.wikipedia.org',
    'pt.wikipedia.org', 'zh.wikipedia.org', 'ar.wikipedia.org', 'bg.wikipedia.org',
    'ca.wikipedia.org', 'cs.wikipedia.org', 'da.wikipedia.org', 'de.wikipedia.org',
    'en.wikipedia.org', 'es.wikipedia.org', 'eo.wikipedia.org', 'eu.wikipedia.org',
    'fa.wikipedia.org', 'fr.wikipedia.org', 'ko.wikipedia.org', 'hi.wikipedia.org',
    'hr.wikipedia.org', 'id.wikipedia.org', 'it.wikipedia.org', 'he.wikipedia.org',
    'lt.wikipedia.org', 'hu.wikipedia.org', 'ms.wikipedia.org', 'nl.wikipedia.org',
    'ja.wikipedia.org', 'no.wikipedia.org', 'pl.wikipedia.org', 'pt.wikipedia.org',
    'kk.wikipedia.org', 'ro.wikipedia.org', 'ru.wikipedia.org', 'sk.wikipedia.org',
    'sl.wikipedia.org', 'sr.wikipedia.org', 'fi.wikipedia.org', 'sv.wikipedia.org',
    'tr.wikipedia.org', 'uk.wikipedia.org', 'vi.wikipedia.org', 'vo.wikipedia.org',
    'war.wikipedia.org', 'zh.wikipedia.org', 'af.wikipedia.org', 'als.wikipedia.org',
    'am.wikipedia.org', 'an.wikipedia.org', 'ast.wikipedia.org', 'ht.wikipedia.org',
    'az.wikipedia.org', 'bn.wikipedia.org', 'ba.wikipedia.org', 'be.wikipedia.org',
    'bpy.wikipedia.org', 'bs.wikipedia.org', 'br.wikipedia.org', 'cv.wikipedia.org',
    'cy.wikipedia.org', 'et.wikipedia.org', 'el.wikipedia.org', 'fy.wikipedia.org',
    'ga.wikipedia.org', 'gl.wikipedia.org', 'gu.wikipedia.org', 'hy.wikipedia.org',
    'io.wikipedia.org', 'ia.wikipedia.org', 'is.wikipedia.org', 'jv.wikipedia.org',
    'kn.wikipedia.org', 'ka.wikipedia.org', 'ku.wikipedia.org', 'la.wikipedia.org',
    'lv.wikipedia.org', 'lb.wikipedia.org', 'lmo.wikipedia.org', 'mk.wikipedia.org',
    'mg.wikipedia.org', 'ml.wikipedia.org', 'mr.wikipedia.org', 'my.wikipedia.org',
    'new.wikipedia.org', 'ne.wikipedia.org', 'nn.wikipedia.org', 'nap.wikipedia.org',
    'oc.wikipedia.org', 'pms.wikipedia.org', 'nds.wikipedia.org', 'qu.wikipedia.org',
    'pnb.wikipedia.org', 'sq.wikipedia.org', 'scn.wikipedia.org', 'simple.wikipedia.org',
    'ceb.wikipedia.org', 'sh.wikipedia.org', 'su.wikipedia.org', 'sw.wikipedia.org',
    'tl.wikipedia.org', 'ta.wikipedia.org', 'tt.wikipedia.org', 'te.wikipedia.org',
    'th.wikipedia.org', 'bug.wikipedia.org', 'ur.wikipedia.org', 'wa.wikipedia.org',
    'yo.wikipedia.org', 'diq.wikipedia.org', 'ace.wikipedia.org', 'frp.wikipedia.org',
    'arc.wikipedia.org', 'gn.wikipedia.org', 'av.wikipedia.org', 'ay.wikipedia.org',
    'bjn.wikipedia.org', 'bh.wikipedia.org', 'bcl.wikipedia.org', 'bar.wikipedia.org',
    'bo.wikipedia.org', 'co.wikipedia.org', 'pdc.wikipedia.org', 'dv.wikipedia.org',
    'nv.wikipedia.org', 'ang.wikipedia.org', 'eml.wikipedia.org', 'myv.wikipedia.org',
    'ext.wikipedia.org', 'hif.wikipedia.org', 'fo.wikipedia.org', 'frr.wikipedia.org',
    'fur.wikipedia.org', 'gv.wikipedia.org', 'gag.wikipedia.org', 'gd.wikipedia.org',
    'gan.wikipedia.org', 'glk.wikipedia.org', 'hak.wikipedia.org', 'xal.wikipedia.org',
    'haw.wikipedia.org', 'hsb.wikipedia.org', 'ilo.wikipedia.org', 'ie.wikipedia.org',
    'os.wikipedia.org', 'kl.wikipedia.org', 'pam.wikipedia.org', 'csb.wikipedia.org',
    'kw.wikipedia.org', 'km.wikipedia.org', 'rw.wikipedia.org', 'kv.wikipedia.org',
    'ky.wikipedia.org', 'mrj.wikipedia.org', 'lad.wikipedia.org', 'lbe.wikipedia.org',
    'lij.wikipedia.org', 'li.wikipedia.org', 'ln.wikipedia.org', 'jbo.wikipedia.org',
    'mt.wikipedia.org', 'mi.wikipedia.org', 'xmf.wikipedia.org', 'arz.wikipedia.org',
    'mzn.wikipedia.org', 'mdf.wikipedia.org', 'mn.wikipedia.org', 'nah.wikipedia.org',
    'nrm.wikipedia.org', 'nov.wikipedia.org', 'ce.wikipedia.org', 'mhr.wikipedia.org',
    'or.wikipedia.org', 'as.wikipedia.org', 'uz.wikipedia.org', 'pi.wikipedia.org',
    'pag.wikipedia.org', 'pa.wikipedia.org', 'pap.wikipedia.org', 'ps.wikipedia.org',
    'koi.wikipedia.org', 'pfl.wikipedia.org', 'pcd.wikipedia.org', 'krc.wikipedia.org',
    'crh.wikipedia.org', 'ksh.wikipedia.org', 'rm.wikipedia.org', 'rue.wikipedia.org',
    'sa.wikipedia.org', 'se.wikipedia.org', 'sc.wikipedia.org', 'sah.wikipedia.org',
    'sco.wikipedia.org', 'stq.wikipedia.org', 'si.wikipedia.org', 'szl.wikipedia.org',
    'so.wikipedia.org', 'ckb.wikipedia.org', 'tg.wikipedia.org', 'tpi.wikipedia.org',
    'to.wikipedia.org', 'tk.wikipedia.org', 'udm.wikipedia.org', 'ug.wikipedia.org',
    'vec.wikipedia.org', 'vls.wikipedia.org', 'wo.wikipedia.org', 'wuu.wikipedia.org',
    'yi.wikipedia.org', 'zea.wikipedia.org', 'kbd.wikipedia.org', 'ak.wikipedia.org',
    'ab.wikipedia.org', 'bm.wikipedia.org', 'bi.wikipedia.org', 'bxr.wikipedia.org',
    'ch.wikipedia.org', 'ny.wikipedia.org', 'za.wikipedia.org', 'dsb.wikipedia.org',
    'ee.wikipedia.org', 'ff.wikipedia.org', 'ki.wikipedia.org', 'got.wikipedia.org',
    'ha.wikipedia.org', 'ig.wikipedia.org', 'iu.wikipedia.org', 'ik.wikipedia.org',
    'ks.wikipedia.org', 'kg.wikipedia.org', 'lo.wikipedia.org', 'ltg.wikipedia.org',
    'lg.wikipedia.org', 'cdo.wikipedia.org', 'mwl.wikipedia.org', 'mo.wikipedia.org',
    'fj.wikipedia.org', 'na.wikipedia.org', 'cr.wikipedia.org', 'pih.wikipedia.org',
    'om.wikipedia.org', 'pnt.wikipedia.org', 'kaa.wikipedia.org', 'dz.wikipedia.org',
    'rmy.wikipedia.org', 'rn.wikipedia.org', 'sm.wikipedia.org', 'sg.wikipedia.org',
    'st.wikipedia.org', 'nso.wikipedia.org', 'tn.wikipedia.org', 'sn.wikipedia.org',
    'sd.wikipedia.org', 'cu.wikipedia.org', 'ss.wikipedia.org', 'srn.wikipedia.org',
    'ty.wikipedia.org', 'kab.wikipedia.org', 'tet.wikipedia.org', 'ti.wikipedia.org',
    'chr.wikipedia.org', 'tum.wikipedia.org', 'ts.wikipedia.org', 'chy.wikipedia.org',
    've.wikipedia.org', 'tw.wikipedia.org', 'vep.wikipedia.org', 'xh.wikipedia.org',
    'zu.wikipedia.org', 'de.wikipedia.org', 'pl.wikipedia.org', 'ja.wikipedia.org',
    'zh.wikipedia.org', 'ru.wikipedia.org', 'eo.wikipedia.org', 'vi.wikipedia.org',
    # From http://www.google.com/supported_domains
    'google.com', 'google.ad', 'google.ae', 'google.com.af', 'google.com.ag',
    'google.com.ai', 'google.am', 'google.co.ao', 'google.com.ar', 'google.as',
    'google.at', 'google.com.au', 'google.az', 'google.ba', 'google.com.bd', 'google.be',
    'google.bf', 'google.bg', 'google.com.bh', 'google.bi', 'google.bj',
    'google.com.bn', 'google.com.bo', 'google.com.br', 'google.bs', 'google.co.bw',
    'google.by', 'google.com.bz', 'google.ca', 'google.cd', 'google.cf', 'google.cg',
    'google.ch', 'google.ci', 'google.co.ck', 'google.cl', 'google.cm', 'google.cn',
    'google.com.co', 'google.co.cr', 'google.com.cu', 'google.cv', 'google.com.cy',
    'google.cz', 'google.de', 'google.dj', 'google.dk', 'google.dm', 'google.com.do',
    'google.dz', 'google.com.ec', 'google.ee', 'google.com.eg', 'google.es',
    'google.com.et', 'google.fi', 'google.com.fj', 'google.fm', 'google.fr', 'google.ga',
    'google.ge', 'google.gg', 'google.com.gh', 'google.com.gi', 'google.gl', 'google.gm',
    'google.gp', 'google.gr', 'google.com.gt', 'google.gy', 'google.com.hk',
    'google.hn', 'google.hr', 'google.ht', 'google.hu', 'google.co.id', 'google.ie',
    'google.co.il', 'google.im', 'google.co.in', 'google.iq', 'google.is', 'google.it',
    'google.je', 'google.com.jm', 'google.jo', 'google.co.jp', 'google.co.ke',
    'google.com.kh', 'google.ki', 'google.kg', 'google.co.kr', 'google.com.kw',
    'google.kz', 'google.la', 'google.com.lb', 'google.li', 'google.lk', 'google.co.ls',
    'google.lt', 'google.lu', 'google.lv', 'google.com.ly', 'google.co.ma', 'google.md',
    'google.me', 'google.mg', 'google.mk', 'google.ml', 'google.mn', 'google.ms',
    'google.com.mt', 'google.mu', 'google.mv', 'google.mw', 'google.com.mx',
    'google.com.my', 'google.co.mz', 'google.com.na', 'google.com.nf', 'google.com.ng',
    'google.com.ni', 'google.ne', 'google.nl', 'google.no', 'google.com.np', 'google.nr',
    'google.nu', 'google.co.nz', 'google.com.om', 'google.com.pa', 'google.com.pe',
    'google.com.ph', 'google.com.pk', 'google.pl', 'google.pn', 'google.com.pr',
    'google.ps', 'google.pt', 'google.com.py', 'google.com.qa', 'google.ro', 'google.ru',
    'google.rw', 'google.com.sa', 'google.com.sb', 'google.sc', 'google.se',
    'google.com.sg', 'google.sh', 'google.si', 'google.sk', 'google.com.sl', 'google.sn',
    'google.so', 'google.sm', 'google.st', 'google.com.sv', 'google.td', 'google.tg',
    'google.co.th', 'google.com.tj', 'google.tk', 'google.tl', 'google.tm',
    'google.tn', 'google.to', 'google.com.tr', 'google.tt', 'google.com.tw',
    'google.co.tz', 'google.com.ua', 'google.co.ug', 'google.co.uk', 'google.com.uy',
    'google.co.uz', 'google.com.vc', 'google.co.ve', 'google.vg', 'google.co.vi',
    'google.com.vn', 'google.vu', 'google.ws', 'google.rs', 'google.co.za',
    'google.co.zm', 'google.co.zw', 'google.cat',
    # From http://www.ebay.ch/ (eBay-Websites)
    'mercadolibre.com.ar', 'ebay.com.au', 'ebay.at', 'ebay.be',
    'mercadolivre.com.br', 'ebay.com.cn', 'ebay.dk', 'ebay.de', 'ebay.fr', 'ebay.gr',
    'ebay.co.uk', 'ebay.com.hk', 'ebay.in', 'ebay.ie', 'ebay.it', 'ebay.ca',
    'auction.co.kr', 'ebay.com.my', 'mercadolibre.com.mx', 'pages.ebay.com', 'ebay.nl',
    'ebay.no', 'ebay.ph', 'ebay.pl', 'ebay.ru', 'ebay.se', 'ebay.com.sg', 'ebay.es',
    'ruten.com.tw', 'ebay.co.th', 'gittigidiyor.com', 'ebay.cz', 'ebay.com', 'ebay.vn',
    # From http://www.amazon.com/ (footer)
    'amazon.ca', 'amazon.cn', 'amazon.fr', 'amazon.de', 'amazon.it', 'amazon.co.jp',
    'amazon.es', 'amazon.co.uk',

    # From http://en.wikipedia.org/wiki/.gov#States_in_GOV
    'al.gov', 'alabama.gov',
    'alaska.gov',
    'az.gov',
    'ar.gov', 'arkansas.gov',
    'ca.gov', 'california.gov',
    'colorado.gov',
    'ct.gov',
    'delaware.gov',
    'florida.gov', 'fl.gov',
    'georgia.gov', 'ga.gov',
    'guam.gov',
    'hawaii.gov',
    'idaho.gov',
    'illinois.gov',
    'in.gov',
    'iowa.gov', 'ia.gov',
    'ks.gov', 'kansas.gov',
    'ky.gov', 'kentucky.gov',
    'louisiana.gov',
    'maine.gov',
    'maryland.gov',
    'mass.gov',
    'michigan.gov',
    'mn.gov',
    'mississippi.gov',
    'mo.gov',
    'mt.gov', 'montana.gov',
    'nebraska.gov',
    'nv.gov',
    'nh.gov', 'visitnh.gov',
    'nj.gov', 'newjersey.gov',
    'newmexico.gov',
    'ny.gov',
    'nc.gov', 'northcarolina.gov',
    'nd.gov',
    'ohio.gov', 'oh.gov',
    'ok.gov',
    'oregon.gov',
    'pa.gov', 'pennsylvania.gov',
    'pr.gov',
    'ri.gov',
    'sc.gov',
    'sd.gov',
    'tennessee.gov', 'tn.gov',
    'texas.gov',
    'utah.gov',
    'vermont.gov',
    'virginia.gov',
    'wa.gov', 'washington.gov',
    'wv.gov',
    'wisconsin.gov',
    'wyoming.gov',
    'dc.gov',
]
| 122 |
# Second-level suffixes that iterateTLDs() yields before the entries of the
# downloaded effective TLD list.
additionalTLDs = [
    # From http://en.wikipedia.org/wiki/.ar
    'com.ar', 'edu.ar', 'gob.ar', 'gov.ar', 'int.ar', 'mil.ar', 'net.ar', 'org.ar',
    'tur.ar',

    # From http://en.wikipedia.org/wiki/.au
    'com.au', 'net.au', 'org.au', 'edu.au', 'gov.au', 'csiro.au', 'asn.au',
    'id.au',

    # From http://en.wikipedia.org/wiki/.bd
    'com.bd', 'edu.bd', 'ac.bd', 'net.bd', 'gov.bd', 'org.bd', 'mil.bd',

    # From http://en.wikipedia.org/wiki/.bn
    'com.bn', 'edu.bn', 'gov.bn', 'net.bn', 'org.bn',

    # From http://en.wikipedia.org/wiki/.ck
    'co.ck', 'org.ck', 'edu.ck', 'gov.ck', 'net.ck', 'gen.ck', 'biz.ck', 'info.ck',

    # From http://en.wikipedia.org/wiki/.cy
    'ac.cy', 'net.cy', 'gov.cy', 'org.cy', 'pro.cy', 'name.cy', 'ekloges.cy',
    'tm.cy', 'ltd.cy', 'biz.cy', 'press.cy', 'parliament.cy', 'com.cy',

    # From http://en.wikipedia.org/wiki/.er
    'com.er', 'edu.er', 'gov.er', 'mil.er', 'net.er', 'org.er', 'ind.er',

    # From http://en.wikipedia.org/wiki/.et
    'com.et', 'gov.et', 'org.et', 'edu.et', 'net.et', 'biz.et', 'name.et',
    'info.et',

    # From http://en.wikipedia.org/wiki/.fj
    'ac.fj', 'biz.fj', 'com.fj', 'info.fj', 'mil.fj', 'name.fj', 'net.fj',
    'org.fj', 'pro.fj',

    # From http://en.wikipedia.org/wiki/.fk
    'co.fk', 'org.fk', 'gov.fk', 'ac.fk', 'nom.fk', 'net.fk',

    # From http://en.wikipedia.org/wiki/.gt
    'com.gt', 'edu.gt', 'net.gt', 'gob.gt', 'org.gt', 'mil.gt', 'ind.gt',

    # From http://en.wikipedia.org/wiki/.gu
    'com.gu', 'net.gu', 'gov.gu', 'org.gu', 'edu.gu',

    # From http://en.wikipedia.org/wiki/.il
    'ac.il', 'co.il', 'org.il', 'net.il', 'k12.il', 'gov.il', 'muni.il', 'idf.il',

    # From http://en.wikipedia.org/wiki/.jm
    'com.jm', 'net.jm', 'org.jm', 'edu.jm', 'gov.jm', 'mil.jm',

    # From http://en.wikipedia.org/wiki/.ke
    'co.ke', 'or.ke', 'ne.ke', 'go.ke', 'ac.ke', 'sc.ke', 'me.ke', 'mobi.ke',
    'info.ke',

    # From http://en.wikipedia.org/wiki/.kh
    'per.kh', 'com.kh', 'edu.kh', 'gov.kh', 'mil.kh', 'net.kh', 'org.kh',

    # From http://en.wikipedia.org/wiki/.kw
    'edu.kw', 'com.kw', 'net.kw', 'org.kw', 'gov.kw',

    # From http://en.wikipedia.org/wiki/.mm
    'net.mm', 'com.mm', 'edu.mm', 'org.mm', 'gov.mm',

    # From http://en.wikipedia.org/wiki/.mt
    'com.mt', 'org.mt', 'net.mt', 'edu.mt', 'gov.mt',

    # From http://en.wikipedia.org/wiki/.mz
    'adv.mz', 'ac.mz', 'co.mz', 'org.mz', 'gov.mz', 'edu.mz',

    # From http://en.wikipedia.org/wiki/.ni
    'gob.ni', 'co.ni', 'com.ni', 'ac.ni', 'edu.ni', 'org.ni', 'nom.ni', 'net.ni',
    'mil.ni',

    # From http://en.wikipedia.org/wiki/.np
    'com.np', 'edu.np', 'gov.np', 'mil.np', 'net.np', 'org.np',

    # From http://en.wikipedia.org/wiki/.nz
    'ac.nz', 'co.nz', 'geek.nz', 'gen.nz', 'maori.nz', 'net.nz', 'org.nz',
    'school.nz',
    'cri.nz', 'govt.nz', 'iwi.nz', 'parliament.nz', 'mil.nz', 'health.nz',

    # From http://en.wikipedia.org/wiki/.om
    'com.om', 'co.om', 'edu.om', 'ac.om', 'sch.om', 'gov.om', 'net.om', 'org.om',
    'mil.om', 'museum.om', 'biz.om', 'pro.om', 'med.om',

    # From http://en.wikipedia.org/wiki/.pg
    'com.pg', 'net.pg', 'ac.pg', 'gov.pg', 'mil.pg', 'org.pg',

    # From http://en.wikipedia.org/wiki/.py
    'org.py', 'edu.py', 'mil.py', 'gov.py', 'net.py', 'com.py', 'coop.py',

    # From http://en.wikipedia.org/wiki/.qa
    'com.qa', 'net.qa', 'org.qa', 'gov.qa', 'edu.qa', 'mil.qa', 'name.qa',
    'sch.qa',

    # From http://en.wikipedia.org/wiki/.sv
    'edu.sv', 'gob.sv', 'com.sv', 'org.sv', 'red.sv',

    # From http://en.wikipedia.org/wiki/.tr
    'com.tr', 'gen.tr', 'org.tr', 'biz.tr', 'info.tr', 'av.tr', 'dr.tr', 'pol.tr',
    'bel.tr', 'tsk.tr', 'bbs.tr', 'k12.tr', 'edu.tr', 'name.tr', 'net.tr',
    'gov.tr',
    'web.tr', 'tel.tr', 'tv.tr', 'nc.tr',

    # From http://en.wikipedia.org/wiki/.uk
    'ac.uk', 'co.uk', 'gov.uk', 'judiciary.uk', 'ltd.uk', 'me.uk', 'mod.uk',
    'net.uk',
    'nhs.uk', 'nic.uk', 'org.uk', 'parliament.uk', 'plc.uk', 'police.uk',
    'sch.uk',

    # From http://en.wikipedia.org/wiki/.uy
    'com.uy', 'edu.uy', 'gub.uy', 'net.uy', 'mil.uy', 'org.uy',

    # From http://en.wikipedia.org/wiki/.ve
    'com.ve', 'net.ve', 'org.ve', 'info.ve', 'co.ve', 'web.ve', 'gob.ve', 'edu.ve',
    'mil.ve', 'tec.ve',

    # From http://en.wikipedia.org/wiki/.ye
    'com.ye', 'co.ye', 'ltd.ye', 'me.ye', 'net.ye', 'org.ye', 'plc.ye', 'gov.ye',

    # From http://en.wikipedia.org/wiki/.za
    # NOTE(review): 'net.work.za' is the only three-label entry in this list -
    # confirm against the cited page that it is not a typo.
    'ac.za', 'city.za', 'co.za', 'edu.za', 'gov.za', 'law.za', 'mil.za', 'nom.za',
    'org.za', 'school.za',
    'alt.za', 'net.work.za', 'ngo.za', 'tm.za', 'web.za', 'bourse.za',
    'agric.za', 'cybernet.za', 'grondar.za', 'iaccess.za', 'inca.za', 'nis.za',
    'olivetti.za', 'pix.za',

    # From http://en.wikipedia.org/wiki/.zm
    'ac.zm', 'co.zm', 'com.zm', 'edu.zm', 'gov.zm', 'net.zm', 'org.zm', 'sch.zm',

    # From http://en.wikipedia.org/wiki/.zw
    'co.zw', 'ac.zw', 'org.zw',
]
| 240 |
def getSuffixes(target, items):
    """
    Builds a tree keyed by trailing characters and stores it in target.

    items maps strings to priorities. The strings are grouped by their last
    character (the empty string is filed under the key ''). A group with a
    single member becomes the string "<rest reversed> <priority>", where rest
    is the member without its last character. A group with several members
    becomes a nested dictionary that is built the same way from the members
    with their last character removed.

    target is modified in place, nothing is returned. Only constructs that
    exist in both Python 2 and Python 3 are used.
    """
    groups = {}
    for item, priority in items.items():
        suffix = item[-1] if item else ''
        groups.setdefault(suffix, {})[item[:-1]] = priority

    for suffix, members in groups.items():
        if len(members) == 1:
            # Nothing left to distinguish on this branch: collapse the
            # remaining characters into a single leaf string
            rest, priority = next(iter(members.items()))
            target[suffix] = ''.join(reversed(rest)) + ' ' + str(priority)
        else:
            target[suffix] = {}
            getSuffixes(target[suffix], members)
| 255 |
def urlopen(url, attempts=3):
    """
    Tries to open a particular URL, retries on failure.

    At most `attempts` tries are made, with a pause of five seconds between
    two consecutive tries. Returns the response object of the first try that
    succeeds. If every try fails with an IOError, the error of the last try
    is raised. Raises ValueError if attempts is smaller than one.
    """
    if attempts < 1:
        raise ValueError('attempts must be at least 1, got %r' % (attempts,))

    try:
        from urllib.request import urlopen as openURL  # Python 3
    except ImportError:
        openURL = urllib.urlopen  # Python 2

    for remaining in range(attempts - 1, -1, -1):
        try:
            return openURL(url)
        except IOError:
            if not remaining:
                # Last try failed as well, let the caller see the error
                raise
            time.sleep(5)
| 267 |
def iterateTLDs():
    """
    Yields the known TLDs and public suffixes.

    The entries of additionalTLDs come first, followed by the entries of the
    effective TLD list that is downloaded on demand. In the downloaded list,
    comment lines (starting with //) and empty lines are skipped, and the
    "*." and "!" rule prefixes are stripped so that only the bare suffix is
    produced. Errors raised by urlopen() propagate to the caller.
    """
    for tld in additionalTLDs:
        yield tld

    # NOTE(review): confirm that this host is still being served; the list is
    # also published on http://publicsuffix.org/
    url = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
    resource = urlopen(url)
    try:
        # Read everything up front so that the connection can be released
        # before the first entry is handed to the consumer
        data = resource.read().decode('utf-8')
    finally:
        resource.close()

    for line in data.splitlines():
        line = line.rstrip()
        if line.startswith("//"):
            continue

        if line.startswith('*.'):
            tld = line[2:]
        elif line.startswith('!'):
            tld = line[1:]
        else:
            tld = line

        if tld:
            yield tld
| 288 |
def getTLDs(domains, tldPriority):
    """
    Adds every suffix produced by iterateTLDs() to domains with the priority
    tldPriority. Suffixes that are already present keep their current value.
    """
    for suffix in iterateTLDs():
        domains.setdefault(suffix, tldPriority)
| 293 |
def updateSchemes(rules):
    """
    Replaces rules['scheme'] by a fresh tree built from the schemes mapping.
    """
    tree = {}
    rules['scheme'] = tree
    getSuffixes(tree, schemes)
| 297 |
def updateDomains(rules):
    """
    Builds rules['domain'] from the domain list on standard input.

    Standard input is read as UTF-8, one domain per line, followed by the
    entries of additionalDomains. Empty lines and domains seen before are
    ignored. The first domain gets the highest priority (the number of
    distinct domains), every following one a priority that is lower by one.
    All parent domains of the listed domains are added with the priority of
    the child minus the maximum priority (zero or below). Finally the
    suffixes from getTLDs() are added with the lowest priority and the
    result is converted into a tree by getSuffixes().
    """
    domains = {}
    # The UTF-8 reader needs bytes: Python 3 exposes them as sys.stdin.buffer,
    # on Python 2 sys.stdin itself is the byte stream
    reader = codecs.getreader('utf-8')(getattr(sys.stdin, 'buffer', sys.stdin))
    i = 0
    for domain in itertools.chain(reader.readlines(), additionalDomains):
        domain = domain.rstrip()
        if not domain or domain in domains:
            continue
        domains[domain] = i
        i += 1

    # Turn the position in the list into a priority, first entry is highest.
    # Only values of existing keys change, so iterating the dict is safe.
    maxPriority = i
    for domain in domains:
        domains[domain] = maxPriority - domains[domain]

    # Extract TLDs from domain list. The loop adds keys to the dictionary, so
    # it has to run over a snapshot of the entries.
    leadingLabel = re.compile(r'^[^.]+\.+')
    for domain, priority in list(domains.items()):
        parentPriority = priority - maxPriority
        while True:
            domain, stripped = leadingLabel.subn('', domain)
            if not stripped or not domain:
                break
            if domain not in domains or domains[domain] < parentPriority:
                domains[domain] = parentPriority

    # Fill up with "official" TLDs
    getTLDs(domains, -maxPriority)

    rules['domain'] = {}
    getSuffixes(rules['domain'], domains)
| 329 |
def writeRules(rules):
    """
    Writes the generated dictionaries into defaults/rules.json.

    The file is read as UTF-8 and everything after the line containing the
    marker "// Automatically generated dictionaries" is replaced: the members
    of rules are appended as compact JSON with sorted keys (without the outer
    braces), followed by a closing brace. Raises Exception if the marker is
    missing, in which case the file is left untouched.
    """
    path = os.path.join('defaults', 'rules.json')
    with codecs.open(path, 'rb', encoding='utf-8') as source:
        data = source.read()

    marker = '// Automatically generated dictionaries'
    markerIndex = data.find(marker)
    if markerIndex < 0:
        raise Exception('Insertion marker not found in %s' % path)

    serialized = json.dumps(rules, ensure_ascii=False, sort_keys=True,
                            separators=(',', ':'))
    data = data[0:markerIndex + len(marker)] + '\n'
    # Strip the outer braces, the opening one is already part of the file.
    # NOTE(review): confirm the width of the indent string against the
    # formatting used in rules.json.
    data += ' ' + serialized[1:-1] + '\n}\n'

    with codecs.open(path, 'wb', encoding='utf-8') as destination:
        destination.write(data)
| 346 |
def updateRules():
    """
    Regenerates all dictionaries: starts from the referral table, adds the
    scheme and domain trees and writes the result to the rules file.
    """
    rules = {'domainReferrals': domainReferrals}
    updateSchemes(rules)
    updateDomains(rules)
    writeRules(rules)
| 353 |
# Regenerate the rules only when executed as a script, not on import
if __name__ == "__main__":
    updateRules()
OLD | NEW |