更好地解析广告规则

This commit is contained in:
Moshel
2018-01-11 17:45:10 +08:00
parent 88592804d3
commit fb878a4c81
12 changed files with 1822 additions and 173 deletions

View File

@@ -41,6 +41,7 @@
脚本,运行所需时间较长。自动爬取生成 `top500_*.list` 文件。 脚本,运行所需时间较长。自动爬取生成 `top500_*.list` 文件。
-----------------------------------
**resultant/ad.list** **resultant/ad.list**
@@ -48,5 +49,5 @@
**ad.py** **ad.py**
脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad_*.list` 文件。 脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad.list` 文件。

View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
#
# 提取广告规则,并且只提取对全域禁止的那种规则
#
import time import time
import sys import sys
import requests import requests
@@ -40,26 +44,29 @@ for rule_url in rules_url:
rule = r.text rule = r.text
# parse html # parse rule
rule = rule.split('\n') rule = rule.split('\n')
for row in rule: for row in rule:
if not row.startswith('||') and not row.startswith('|http'): row = row.strip()
# 直接跳过
if row.startswith('!') or row.startswith('@@') or "$" in row:
continue continue
# del prefix # 清除前缀
row = re.sub(r'^\|(\||https?:\/\/)', '', row) row = re.sub(r'^\|?https?:\/\/', '', row)
# del suffix row = re.sub(r'^\|\|', '', row)
row = row.rstrip('/^ ') row = row.lstrip('.*')
if re.search(r'[\$\^:\*]', row): # 清除后缀
continue row = row.rstrip('/^*')
if row.count('/'):
# 不能含有的字符
if re.search(r'[\/\^:\*]', row):
continue continue
if not re.match(r'\w+(\.\w+)+$', row): # 只匹配域名或 IP
continue if re.match(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$', row) or re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', row):
# match
domains.append(row) domains.append(row)
print('done.') print('done.')

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
# top500 direct list update time: 2017-11-26 15:20:47 # top500 direct list update time: 2018-01-11 17:42:31
123movies.to 123movies.to
163.com 163.com
1688.com 1688.com
@@ -19,7 +19,6 @@ adexchangeprediction.com
adf.ly adf.ly
adnetworkperformance.com adnetworkperformance.com
adobe.com adobe.com
airbnb.com
alibaba.com alibaba.com
aliexpress.com aliexpress.com
alipay.com alipay.com
@@ -48,7 +47,6 @@ baidu.com
baike.com baike.com
bankofamerica.com bankofamerica.com
battle.net battle.net
bbc.co.uk
bbc.com bbc.com
behance.net behance.net
bestbuy.com bestbuy.com
@@ -61,7 +59,6 @@ bitauto.com
blackboard.com blackboard.com
blastingnews.com blastingnews.com
blkget.com blkget.com
bongacams.com
booking.com booking.com
box.com box.com
bukalapak.com bukalapak.com
@@ -89,6 +86,7 @@ daum.net
dcinside.com dcinside.com
dell.com dell.com
detail.tmall.com detail.tmall.com
detik.com
deviantart.com deviantart.com
dictionary.com dictionary.com
digikala.com digikala.com
@@ -129,7 +127,6 @@ giphy.com
github.com github.com
github.io github.io
gizmodo.com gizmodo.com
globaloffers.link
globo.com globo.com
gmx.net gmx.net
go.com go.com
@@ -137,6 +134,7 @@ godaddy.com
goo.ne.jp goo.ne.jp
goodreads.com goodreads.com
groupon.com groupon.com
gsmarena.com
hao123.com hao123.com
haosou.com haosou.com
hatena.ne.jp hatena.ne.jp
@@ -191,6 +189,7 @@ mashable.com
mediafire.com mediafire.com
mediawhirl.net mediawhirl.net
mega.nz mega.nz
mercadolibre.com.ar
mercadolivre.com.br mercadolivre.com.br
mi.com mi.com
microsoft.com microsoft.com
@@ -222,7 +221,6 @@ oracle.com
orange.fr orange.fr
ouo.io ouo.io
outbrain.com outbrain.com
ozock.com
pandora.com pandora.com
paypal.com paypal.com
paytm.com paytm.com
@@ -273,7 +271,6 @@ spotscenered.info
stackexchange.com stackexchange.com
stackoverflow.com stackoverflow.com
state.gov state.gov
steamcommunity.com
steampowered.com steampowered.com
subscene.com subscene.com
taboola.com taboola.com

View File

@@ -1,7 +1,9 @@
# top500 proxy list update time: 2017-11-26 15:20:47 # top500 proxy list update time: 2018-01-11 17:42:31
4shared.com 4shared.com
airbnb.com
archive.org archive.org
ask.com ask.com
bbc.co.uk
beeg.com beeg.com
bet365.com bet365.com
blog.jp blog.jp
@@ -11,11 +13,11 @@ blogspot.com.br
blogspot.in blogspot.in
blogspot.jp blogspot.jp
bloomberg.com bloomberg.com
bongacams.com
bp.blogspot.com bp.blogspot.com
chaturbate.com chaturbate.com
cloudfront.net cloudfront.net
dailymotion.com dailymotion.com
detik.com
disqus.com disqus.com
doubleclick.net doubleclick.net
dropbox.com dropbox.com
@@ -28,8 +30,8 @@ fbcdn.net
fc2.com fc2.com
files.wordpress.com files.wordpress.com
flipkart.com flipkart.com
globaloffers.link
goo.gl goo.gl
gsmarena.com
hclips.com hclips.com
hootsuite.com hootsuite.com
hurriyet.com.tr hurriyet.com.tr
@@ -38,10 +40,10 @@ livedoor.jp
ltn.com.tw ltn.com.tw
media.tumblr.com media.tumblr.com
medium.com medium.com
mercadolibre.com.ar
messenger.com messenger.com
nyaa.se nyaa.se
nytimes.com nytimes.com
ozock.com
pinterest.com pinterest.com
pixnet.net pixnet.net
pornhub.com pornhub.com
@@ -53,6 +55,7 @@ scribd.com
shutterstock.com shutterstock.com
slideshare.net slideshare.net
spankbang.com spankbang.com
steamcommunity.com
t.co t.co
telegram.org telegram.org
thepiratebay.org thepiratebay.org

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
# Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules) # Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules)
# by Moshel # by Moshel
# build time: 2017-11-26 16:22:26 # build time: 2018-01-11 17:42:43
[General] [General]
bypass-system = true bypass-system = true
@@ -51,10 +51,12 @@ IP-CIDR,34.224.0.0/12,Proxy
# 手工定义的 Direct 列表 # 手工定义的 Direct 列表
# top500 proxy list update time: 2017-11-26 15:20:47 # top500 proxy list update time: 2018-01-11 17:42:31
DOMAIN-SUFFIX,4shared.com,Proxy DOMAIN-SUFFIX,4shared.com,Proxy
DOMAIN-SUFFIX,airbnb.com,Proxy
DOMAIN-SUFFIX,archive.org,Proxy DOMAIN-SUFFIX,archive.org,Proxy
DOMAIN-SUFFIX,ask.com,Proxy DOMAIN-SUFFIX,ask.com,Proxy
DOMAIN-SUFFIX,bbc.co.uk,Proxy
DOMAIN-SUFFIX,beeg.com,Proxy DOMAIN-SUFFIX,beeg.com,Proxy
DOMAIN-SUFFIX,bet365.com,Proxy DOMAIN-SUFFIX,bet365.com,Proxy
DOMAIN-SUFFIX,blog.jp,Proxy DOMAIN-SUFFIX,blog.jp,Proxy
@@ -64,11 +66,11 @@ DOMAIN-SUFFIX,blogspot.com.br,Proxy
DOMAIN-SUFFIX,blogspot.in,Proxy DOMAIN-SUFFIX,blogspot.in,Proxy
DOMAIN-SUFFIX,blogspot.jp,Proxy DOMAIN-SUFFIX,blogspot.jp,Proxy
DOMAIN-SUFFIX,bloomberg.com,Proxy DOMAIN-SUFFIX,bloomberg.com,Proxy
DOMAIN-SUFFIX,bongacams.com,Proxy
DOMAIN-SUFFIX,bp.blogspot.com,Proxy DOMAIN-SUFFIX,bp.blogspot.com,Proxy
DOMAIN-SUFFIX,chaturbate.com,Proxy DOMAIN-SUFFIX,chaturbate.com,Proxy
DOMAIN-SUFFIX,cloudfront.net,Proxy DOMAIN-SUFFIX,cloudfront.net,Proxy
DOMAIN-SUFFIX,dailymotion.com,Proxy DOMAIN-SUFFIX,dailymotion.com,Proxy
DOMAIN-SUFFIX,detik.com,Proxy
DOMAIN-SUFFIX,disqus.com,Proxy DOMAIN-SUFFIX,disqus.com,Proxy
DOMAIN-SUFFIX,doubleclick.net,Proxy DOMAIN-SUFFIX,doubleclick.net,Proxy
DOMAIN-SUFFIX,dropbox.com,Proxy DOMAIN-SUFFIX,dropbox.com,Proxy
@@ -81,8 +83,8 @@ DOMAIN-SUFFIX,fbcdn.net,Proxy
DOMAIN-SUFFIX,fc2.com,Proxy DOMAIN-SUFFIX,fc2.com,Proxy
DOMAIN-SUFFIX,files.wordpress.com,Proxy DOMAIN-SUFFIX,files.wordpress.com,Proxy
DOMAIN-SUFFIX,flipkart.com,Proxy DOMAIN-SUFFIX,flipkart.com,Proxy
DOMAIN-SUFFIX,globaloffers.link,Proxy
DOMAIN-SUFFIX,goo.gl,Proxy DOMAIN-SUFFIX,goo.gl,Proxy
DOMAIN-SUFFIX,gsmarena.com,Proxy
DOMAIN-SUFFIX,hclips.com,Proxy DOMAIN-SUFFIX,hclips.com,Proxy
DOMAIN-SUFFIX,hootsuite.com,Proxy DOMAIN-SUFFIX,hootsuite.com,Proxy
DOMAIN-SUFFIX,hurriyet.com.tr,Proxy DOMAIN-SUFFIX,hurriyet.com.tr,Proxy
@@ -91,10 +93,10 @@ DOMAIN-SUFFIX,livedoor.jp,Proxy
DOMAIN-SUFFIX,ltn.com.tw,Proxy DOMAIN-SUFFIX,ltn.com.tw,Proxy
DOMAIN-SUFFIX,media.tumblr.com,Proxy DOMAIN-SUFFIX,media.tumblr.com,Proxy
DOMAIN-SUFFIX,medium.com,Proxy DOMAIN-SUFFIX,medium.com,Proxy
DOMAIN-SUFFIX,mercadolibre.com.ar,Proxy
DOMAIN-SUFFIX,messenger.com,Proxy DOMAIN-SUFFIX,messenger.com,Proxy
DOMAIN-SUFFIX,nyaa.se,Proxy DOMAIN-SUFFIX,nyaa.se,Proxy
DOMAIN-SUFFIX,nytimes.com,Proxy DOMAIN-SUFFIX,nytimes.com,Proxy
DOMAIN-SUFFIX,ozock.com,Proxy
DOMAIN-SUFFIX,pinterest.com,Proxy DOMAIN-SUFFIX,pinterest.com,Proxy
DOMAIN-SUFFIX,pixnet.net,Proxy DOMAIN-SUFFIX,pixnet.net,Proxy
DOMAIN-SUFFIX,pornhub.com,Proxy DOMAIN-SUFFIX,pornhub.com,Proxy
@@ -106,6 +108,7 @@ DOMAIN-SUFFIX,scribd.com,Proxy
DOMAIN-SUFFIX,shutterstock.com,Proxy DOMAIN-SUFFIX,shutterstock.com,Proxy
DOMAIN-SUFFIX,slideshare.net,Proxy DOMAIN-SUFFIX,slideshare.net,Proxy
DOMAIN-SUFFIX,spankbang.com,Proxy DOMAIN-SUFFIX,spankbang.com,Proxy
DOMAIN-SUFFIX,steamcommunity.com,Proxy
DOMAIN-SUFFIX,t.co,Proxy DOMAIN-SUFFIX,t.co,Proxy
DOMAIN-SUFFIX,telegram.org,Proxy DOMAIN-SUFFIX,telegram.org,Proxy
DOMAIN-SUFFIX,thepiratebay.org,Proxy DOMAIN-SUFFIX,thepiratebay.org,Proxy

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
# Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules) # Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules)
# by Moshel # by Moshel
# build time: 2017-11-26 16:22:26 # build time: 2018-01-11 17:42:43
[General] [General]
bypass-system = true bypass-system = true
@@ -51,7 +51,7 @@ IP-CIDR,34.224.0.0/12,Proxy
# 手工定义的 Direct 列表 # 手工定义的 Direct 列表
# top500 direct list update time: 2017-11-26 15:20:47 # top500 direct list update time: 2018-01-11 17:42:31
DOMAIN-SUFFIX,123movies.to,Direct DOMAIN-SUFFIX,123movies.to,Direct
DOMAIN-SUFFIX,163.com,Direct DOMAIN-SUFFIX,163.com,Direct
DOMAIN-SUFFIX,1688.com,Direct DOMAIN-SUFFIX,1688.com,Direct
@@ -72,7 +72,6 @@ DOMAIN-SUFFIX,adexchangeprediction.com,Direct
DOMAIN-SUFFIX,adf.ly,Direct DOMAIN-SUFFIX,adf.ly,Direct
DOMAIN-SUFFIX,adnetworkperformance.com,Direct DOMAIN-SUFFIX,adnetworkperformance.com,Direct
DOMAIN-SUFFIX,adobe.com,Direct DOMAIN-SUFFIX,adobe.com,Direct
DOMAIN-SUFFIX,airbnb.com,Direct
DOMAIN-SUFFIX,alibaba.com,Direct DOMAIN-SUFFIX,alibaba.com,Direct
DOMAIN-SUFFIX,aliexpress.com,Direct DOMAIN-SUFFIX,aliexpress.com,Direct
DOMAIN-SUFFIX,alipay.com,Direct DOMAIN-SUFFIX,alipay.com,Direct
@@ -101,7 +100,6 @@ DOMAIN-SUFFIX,baidu.com,Direct
DOMAIN-SUFFIX,baike.com,Direct DOMAIN-SUFFIX,baike.com,Direct
DOMAIN-SUFFIX,bankofamerica.com,Direct DOMAIN-SUFFIX,bankofamerica.com,Direct
DOMAIN-SUFFIX,battle.net,Direct DOMAIN-SUFFIX,battle.net,Direct
DOMAIN-SUFFIX,bbc.co.uk,Direct
DOMAIN-SUFFIX,bbc.com,Direct DOMAIN-SUFFIX,bbc.com,Direct
DOMAIN-SUFFIX,behance.net,Direct DOMAIN-SUFFIX,behance.net,Direct
DOMAIN-SUFFIX,bestbuy.com,Direct DOMAIN-SUFFIX,bestbuy.com,Direct
@@ -114,7 +112,6 @@ DOMAIN-SUFFIX,bitauto.com,Direct
DOMAIN-SUFFIX,blackboard.com,Direct DOMAIN-SUFFIX,blackboard.com,Direct
DOMAIN-SUFFIX,blastingnews.com,Direct DOMAIN-SUFFIX,blastingnews.com,Direct
DOMAIN-SUFFIX,blkget.com,Direct DOMAIN-SUFFIX,blkget.com,Direct
DOMAIN-SUFFIX,bongacams.com,Direct
DOMAIN-SUFFIX,booking.com,Direct DOMAIN-SUFFIX,booking.com,Direct
DOMAIN-SUFFIX,box.com,Direct DOMAIN-SUFFIX,box.com,Direct
DOMAIN-SUFFIX,bukalapak.com,Direct DOMAIN-SUFFIX,bukalapak.com,Direct
@@ -142,6 +139,7 @@ DOMAIN-SUFFIX,daum.net,Direct
DOMAIN-SUFFIX,dcinside.com,Direct DOMAIN-SUFFIX,dcinside.com,Direct
DOMAIN-SUFFIX,dell.com,Direct DOMAIN-SUFFIX,dell.com,Direct
DOMAIN-SUFFIX,detail.tmall.com,Direct DOMAIN-SUFFIX,detail.tmall.com,Direct
DOMAIN-SUFFIX,detik.com,Direct
DOMAIN-SUFFIX,deviantart.com,Direct DOMAIN-SUFFIX,deviantart.com,Direct
DOMAIN-SUFFIX,dictionary.com,Direct DOMAIN-SUFFIX,dictionary.com,Direct
DOMAIN-SUFFIX,digikala.com,Direct DOMAIN-SUFFIX,digikala.com,Direct
@@ -182,7 +180,6 @@ DOMAIN-SUFFIX,giphy.com,Direct
DOMAIN-SUFFIX,github.com,Direct DOMAIN-SUFFIX,github.com,Direct
DOMAIN-SUFFIX,github.io,Direct DOMAIN-SUFFIX,github.io,Direct
DOMAIN-SUFFIX,gizmodo.com,Direct DOMAIN-SUFFIX,gizmodo.com,Direct
DOMAIN-SUFFIX,globaloffers.link,Direct
DOMAIN-SUFFIX,globo.com,Direct DOMAIN-SUFFIX,globo.com,Direct
DOMAIN-SUFFIX,gmx.net,Direct DOMAIN-SUFFIX,gmx.net,Direct
DOMAIN-SUFFIX,go.com,Direct DOMAIN-SUFFIX,go.com,Direct
@@ -190,6 +187,7 @@ DOMAIN-SUFFIX,godaddy.com,Direct
DOMAIN-SUFFIX,goo.ne.jp,Direct DOMAIN-SUFFIX,goo.ne.jp,Direct
DOMAIN-SUFFIX,goodreads.com,Direct DOMAIN-SUFFIX,goodreads.com,Direct
DOMAIN-SUFFIX,groupon.com,Direct DOMAIN-SUFFIX,groupon.com,Direct
DOMAIN-SUFFIX,gsmarena.com,Direct
DOMAIN-SUFFIX,hao123.com,Direct DOMAIN-SUFFIX,hao123.com,Direct
DOMAIN-SUFFIX,haosou.com,Direct DOMAIN-SUFFIX,haosou.com,Direct
DOMAIN-SUFFIX,hatena.ne.jp,Direct DOMAIN-SUFFIX,hatena.ne.jp,Direct
@@ -244,6 +242,7 @@ DOMAIN-SUFFIX,mashable.com,Direct
DOMAIN-SUFFIX,mediafire.com,Direct DOMAIN-SUFFIX,mediafire.com,Direct
DOMAIN-SUFFIX,mediawhirl.net,Direct DOMAIN-SUFFIX,mediawhirl.net,Direct
DOMAIN-SUFFIX,mega.nz,Direct DOMAIN-SUFFIX,mega.nz,Direct
DOMAIN-SUFFIX,mercadolibre.com.ar,Direct
DOMAIN-SUFFIX,mercadolivre.com.br,Direct DOMAIN-SUFFIX,mercadolivre.com.br,Direct
DOMAIN-SUFFIX,mi.com,Direct DOMAIN-SUFFIX,mi.com,Direct
DOMAIN-SUFFIX,microsoft.com,Direct DOMAIN-SUFFIX,microsoft.com,Direct
@@ -275,7 +274,6 @@ DOMAIN-SUFFIX,oracle.com,Direct
DOMAIN-SUFFIX,orange.fr,Direct DOMAIN-SUFFIX,orange.fr,Direct
DOMAIN-SUFFIX,ouo.io,Direct DOMAIN-SUFFIX,ouo.io,Direct
DOMAIN-SUFFIX,outbrain.com,Direct DOMAIN-SUFFIX,outbrain.com,Direct
DOMAIN-SUFFIX,ozock.com,Direct
DOMAIN-SUFFIX,pandora.com,Direct DOMAIN-SUFFIX,pandora.com,Direct
DOMAIN-SUFFIX,paypal.com,Direct DOMAIN-SUFFIX,paypal.com,Direct
DOMAIN-SUFFIX,paytm.com,Direct DOMAIN-SUFFIX,paytm.com,Direct
@@ -326,7 +324,6 @@ DOMAIN-SUFFIX,spotscenered.info,Direct
DOMAIN-SUFFIX,stackexchange.com,Direct DOMAIN-SUFFIX,stackexchange.com,Direct
DOMAIN-SUFFIX,stackoverflow.com,Direct DOMAIN-SUFFIX,stackoverflow.com,Direct
DOMAIN-SUFFIX,state.gov,Direct DOMAIN-SUFFIX,state.gov,Direct
DOMAIN-SUFFIX,steamcommunity.com,Direct
DOMAIN-SUFFIX,steampowered.com,Direct DOMAIN-SUFFIX,steampowered.com,Direct
DOMAIN-SUFFIX,subscene.com,Direct DOMAIN-SUFFIX,subscene.com,Direct
DOMAIN-SUFFIX,taboola.com,Direct DOMAIN-SUFFIX,taboola.com,Direct

File diff suppressed because it is too large Load Diff