更好地解析广告规则

This commit is contained in:
Moshel
2018-01-11 17:45:10 +08:00
parent 88592804d3
commit fb878a4c81
12 changed files with 1822 additions and 173 deletions

View File

@@ -41,6 +41,7 @@
脚本,运行所需时间较长。自动爬取生成 `top500_*.list` 文件。
-----------------------------------
**resultant/ad.list**
@@ -48,5 +49,5 @@
**ad.py**
脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad_*.list` 文件。
脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad.list` 文件。

View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*-
#
# 提取广告规则,并且只提取对全域禁止的那种规则
#
import time
import sys
import requests
@@ -40,26 +44,29 @@ for rule_url in rules_url:
rule = r.text
# parse html
# parse rule
rule = rule.split('\n')
for row in rule:
if not row.startswith('||') and not row.startswith('|http'):
row = row.strip()
# 直接跳过
if row.startswith('!') or row.startswith('@@') or "$" in row:
continue
# del prefix
row = re.sub(r'^\|(\||https?:\/\/)', '', row)
# del suffix
row = row.rstrip('/^ ')
# 清除前缀
row = re.sub(r'^\|?https?:\/\/', '', row)
row = re.sub(r'^\|\|', '', row)
row = row.lstrip('.*')
if re.search(r'[\$\^:\*]', row):
continue
if row.count('/'):
# 清除后缀
row = row.rstrip('/^*')
# 不能含有的字符
if re.search(r'[\/\^:\*]', row):
continue
if not re.match(r'\w+(\.\w+)+$', row):
continue
# match
# 只匹配域名或 IP
if re.match(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$', row) or re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', row):
domains.append(row)
print('done.')

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
# top500 direct list update time: 2017-11-26 15:20:47
# top500 direct list update time: 2018-01-11 17:42:31
123movies.to
163.com
1688.com
@@ -19,7 +19,6 @@ adexchangeprediction.com
adf.ly
adnetworkperformance.com
adobe.com
airbnb.com
alibaba.com
aliexpress.com
alipay.com
@@ -48,7 +47,6 @@ baidu.com
baike.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
behance.net
bestbuy.com
@@ -61,7 +59,6 @@ bitauto.com
blackboard.com
blastingnews.com
blkget.com
bongacams.com
booking.com
box.com
bukalapak.com
@@ -89,6 +86,7 @@ daum.net
dcinside.com
dell.com
detail.tmall.com
detik.com
deviantart.com
dictionary.com
digikala.com
@@ -129,7 +127,6 @@ giphy.com
github.com
github.io
gizmodo.com
globaloffers.link
globo.com
gmx.net
go.com
@@ -137,6 +134,7 @@ godaddy.com
goo.ne.jp
goodreads.com
groupon.com
gsmarena.com
hao123.com
haosou.com
hatena.ne.jp
@@ -191,6 +189,7 @@ mashable.com
mediafire.com
mediawhirl.net
mega.nz
mercadolibre.com.ar
mercadolivre.com.br
mi.com
microsoft.com
@@ -222,7 +221,6 @@ oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
@@ -273,7 +271,6 @@ spotscenered.info
stackexchange.com
stackoverflow.com
state.gov
steamcommunity.com
steampowered.com
subscene.com
taboola.com

View File

@@ -1,7 +1,9 @@
# top500 proxy list update time: 2017-11-26 15:20:47
# top500 proxy list update time: 2018-01-11 17:42:31
4shared.com
airbnb.com
archive.org
ask.com
bbc.co.uk
beeg.com
bet365.com
blog.jp
@@ -11,11 +13,11 @@ blogspot.com.br
blogspot.in
blogspot.jp
bloomberg.com
bongacams.com
bp.blogspot.com
chaturbate.com
cloudfront.net
dailymotion.com
detik.com
disqus.com
doubleclick.net
dropbox.com
@@ -28,8 +30,8 @@ fbcdn.net
fc2.com
files.wordpress.com
flipkart.com
globaloffers.link
goo.gl
gsmarena.com
hclips.com
hootsuite.com
hurriyet.com.tr
@@ -38,10 +40,10 @@ livedoor.jp
ltn.com.tw
media.tumblr.com
medium.com
mercadolibre.com.ar
messenger.com
nyaa.se
nytimes.com
ozock.com
pinterest.com
pixnet.net
pornhub.com
@@ -53,6 +55,7 @@ scribd.com
shutterstock.com
slideshare.net
spankbang.com
steamcommunity.com
t.co
telegram.org
thepiratebay.org

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
# Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules)
# by Moshel
# build time: 2017-11-26 16:22:26
# build time: 2018-01-11 17:42:43
[General]
bypass-system = true
@@ -51,10 +51,12 @@ IP-CIDR,34.224.0.0/12,Proxy
# 手工定义的 Direct 列表
# top500 proxy list update time: 2017-11-26 15:20:47
# top500 proxy list update time: 2018-01-11 17:42:31
DOMAIN-SUFFIX,4shared.com,Proxy
DOMAIN-SUFFIX,airbnb.com,Proxy
DOMAIN-SUFFIX,archive.org,Proxy
DOMAIN-SUFFIX,ask.com,Proxy
DOMAIN-SUFFIX,bbc.co.uk,Proxy
DOMAIN-SUFFIX,beeg.com,Proxy
DOMAIN-SUFFIX,bet365.com,Proxy
DOMAIN-SUFFIX,blog.jp,Proxy
@@ -64,11 +66,11 @@ DOMAIN-SUFFIX,blogspot.com.br,Proxy
DOMAIN-SUFFIX,blogspot.in,Proxy
DOMAIN-SUFFIX,blogspot.jp,Proxy
DOMAIN-SUFFIX,bloomberg.com,Proxy
DOMAIN-SUFFIX,bongacams.com,Proxy
DOMAIN-SUFFIX,bp.blogspot.com,Proxy
DOMAIN-SUFFIX,chaturbate.com,Proxy
DOMAIN-SUFFIX,cloudfront.net,Proxy
DOMAIN-SUFFIX,dailymotion.com,Proxy
DOMAIN-SUFFIX,detik.com,Proxy
DOMAIN-SUFFIX,disqus.com,Proxy
DOMAIN-SUFFIX,doubleclick.net,Proxy
DOMAIN-SUFFIX,dropbox.com,Proxy
@@ -81,8 +83,8 @@ DOMAIN-SUFFIX,fbcdn.net,Proxy
DOMAIN-SUFFIX,fc2.com,Proxy
DOMAIN-SUFFIX,files.wordpress.com,Proxy
DOMAIN-SUFFIX,flipkart.com,Proxy
DOMAIN-SUFFIX,globaloffers.link,Proxy
DOMAIN-SUFFIX,goo.gl,Proxy
DOMAIN-SUFFIX,gsmarena.com,Proxy
DOMAIN-SUFFIX,hclips.com,Proxy
DOMAIN-SUFFIX,hootsuite.com,Proxy
DOMAIN-SUFFIX,hurriyet.com.tr,Proxy
@@ -91,10 +93,10 @@ DOMAIN-SUFFIX,livedoor.jp,Proxy
DOMAIN-SUFFIX,ltn.com.tw,Proxy
DOMAIN-SUFFIX,media.tumblr.com,Proxy
DOMAIN-SUFFIX,medium.com,Proxy
DOMAIN-SUFFIX,mercadolibre.com.ar,Proxy
DOMAIN-SUFFIX,messenger.com,Proxy
DOMAIN-SUFFIX,nyaa.se,Proxy
DOMAIN-SUFFIX,nytimes.com,Proxy
DOMAIN-SUFFIX,ozock.com,Proxy
DOMAIN-SUFFIX,pinterest.com,Proxy
DOMAIN-SUFFIX,pixnet.net,Proxy
DOMAIN-SUFFIX,pornhub.com,Proxy
@@ -106,6 +108,7 @@ DOMAIN-SUFFIX,scribd.com,Proxy
DOMAIN-SUFFIX,shutterstock.com,Proxy
DOMAIN-SUFFIX,slideshare.net,Proxy
DOMAIN-SUFFIX,spankbang.com,Proxy
DOMAIN-SUFFIX,steamcommunity.com,Proxy
DOMAIN-SUFFIX,t.co,Proxy
DOMAIN-SUFFIX,telegram.org,Proxy
DOMAIN-SUFFIX,thepiratebay.org,Proxy

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
# Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules)
# by Moshel
# build time: 2017-11-26 16:22:26
# build time: 2018-01-11 17:42:43
[General]
bypass-system = true
@@ -51,7 +51,7 @@ IP-CIDR,34.224.0.0/12,Proxy
# 手工定义的 Direct 列表
# top500 direct list update time: 2017-11-26 15:20:47
# top500 direct list update time: 2018-01-11 17:42:31
DOMAIN-SUFFIX,123movies.to,Direct
DOMAIN-SUFFIX,163.com,Direct
DOMAIN-SUFFIX,1688.com,Direct
@@ -72,7 +72,6 @@ DOMAIN-SUFFIX,adexchangeprediction.com,Direct
DOMAIN-SUFFIX,adf.ly,Direct
DOMAIN-SUFFIX,adnetworkperformance.com,Direct
DOMAIN-SUFFIX,adobe.com,Direct
DOMAIN-SUFFIX,airbnb.com,Direct
DOMAIN-SUFFIX,alibaba.com,Direct
DOMAIN-SUFFIX,aliexpress.com,Direct
DOMAIN-SUFFIX,alipay.com,Direct
@@ -101,7 +100,6 @@ DOMAIN-SUFFIX,baidu.com,Direct
DOMAIN-SUFFIX,baike.com,Direct
DOMAIN-SUFFIX,bankofamerica.com,Direct
DOMAIN-SUFFIX,battle.net,Direct
DOMAIN-SUFFIX,bbc.co.uk,Direct
DOMAIN-SUFFIX,bbc.com,Direct
DOMAIN-SUFFIX,behance.net,Direct
DOMAIN-SUFFIX,bestbuy.com,Direct
@@ -114,7 +112,6 @@ DOMAIN-SUFFIX,bitauto.com,Direct
DOMAIN-SUFFIX,blackboard.com,Direct
DOMAIN-SUFFIX,blastingnews.com,Direct
DOMAIN-SUFFIX,blkget.com,Direct
DOMAIN-SUFFIX,bongacams.com,Direct
DOMAIN-SUFFIX,booking.com,Direct
DOMAIN-SUFFIX,box.com,Direct
DOMAIN-SUFFIX,bukalapak.com,Direct
@@ -142,6 +139,7 @@ DOMAIN-SUFFIX,daum.net,Direct
DOMAIN-SUFFIX,dcinside.com,Direct
DOMAIN-SUFFIX,dell.com,Direct
DOMAIN-SUFFIX,detail.tmall.com,Direct
DOMAIN-SUFFIX,detik.com,Direct
DOMAIN-SUFFIX,deviantart.com,Direct
DOMAIN-SUFFIX,dictionary.com,Direct
DOMAIN-SUFFIX,digikala.com,Direct
@@ -182,7 +180,6 @@ DOMAIN-SUFFIX,giphy.com,Direct
DOMAIN-SUFFIX,github.com,Direct
DOMAIN-SUFFIX,github.io,Direct
DOMAIN-SUFFIX,gizmodo.com,Direct
DOMAIN-SUFFIX,globaloffers.link,Direct
DOMAIN-SUFFIX,globo.com,Direct
DOMAIN-SUFFIX,gmx.net,Direct
DOMAIN-SUFFIX,go.com,Direct
@@ -190,6 +187,7 @@ DOMAIN-SUFFIX,godaddy.com,Direct
DOMAIN-SUFFIX,goo.ne.jp,Direct
DOMAIN-SUFFIX,goodreads.com,Direct
DOMAIN-SUFFIX,groupon.com,Direct
DOMAIN-SUFFIX,gsmarena.com,Direct
DOMAIN-SUFFIX,hao123.com,Direct
DOMAIN-SUFFIX,haosou.com,Direct
DOMAIN-SUFFIX,hatena.ne.jp,Direct
@@ -244,6 +242,7 @@ DOMAIN-SUFFIX,mashable.com,Direct
DOMAIN-SUFFIX,mediafire.com,Direct
DOMAIN-SUFFIX,mediawhirl.net,Direct
DOMAIN-SUFFIX,mega.nz,Direct
DOMAIN-SUFFIX,mercadolibre.com.ar,Direct
DOMAIN-SUFFIX,mercadolivre.com.br,Direct
DOMAIN-SUFFIX,mi.com,Direct
DOMAIN-SUFFIX,microsoft.com,Direct
@@ -275,7 +274,6 @@ DOMAIN-SUFFIX,oracle.com,Direct
DOMAIN-SUFFIX,orange.fr,Direct
DOMAIN-SUFFIX,ouo.io,Direct
DOMAIN-SUFFIX,outbrain.com,Direct
DOMAIN-SUFFIX,ozock.com,Direct
DOMAIN-SUFFIX,pandora.com,Direct
DOMAIN-SUFFIX,paypal.com,Direct
DOMAIN-SUFFIX,paytm.com,Direct
@@ -326,7 +324,6 @@ DOMAIN-SUFFIX,spotscenered.info,Direct
DOMAIN-SUFFIX,stackexchange.com,Direct
DOMAIN-SUFFIX,stackoverflow.com,Direct
DOMAIN-SUFFIX,state.gov,Direct
DOMAIN-SUFFIX,steamcommunity.com,Direct
DOMAIN-SUFFIX,steampowered.com,Direct
DOMAIN-SUFFIX,subscene.com,Direct
DOMAIN-SUFFIX,taboola.com,Direct

File diff suppressed because it is too large Load Diff