更好地解析广告规则

This commit is contained in:
Moshel
2018-01-11 17:45:10 +08:00
parent 88592804d3
commit fb878a4c81
12 changed files with 1822 additions and 173 deletions

View File

@@ -41,6 +41,7 @@
脚本,运行所需时间较长。自动爬取生成 `top500_*.list` 文件。
-----------------------------------
**resultant/ad.list**
@@ -48,5 +49,5 @@
**ad.py**
脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad_*.list` 文件。
脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `ad.list` 文件。

View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*-
#
# 提取广告规则,并且只提取对全域禁止的那种规则
#
import time
import sys
import requests
@@ -40,27 +44,30 @@ for rule_url in rules_url:
rule = r.text
# parse html
# parse rule
rule = rule.split('\n')
for row in rule:
if not row.startswith('||') and not row.startswith('|http'):
row = row.strip()
# 直接跳过
if row.startswith('!') or row.startswith('@@') or "$" in row:
continue
# del prefix
row = re.sub(r'^\|(\||https?:\/\/)', '', row)
# del suffix
row = row.rstrip('/^ ')
# 清除前缀
row = re.sub(r'^\|?https?:\/\/', '', row)
row = re.sub(r'^\|\|', '', row)
row = row.lstrip('.*')
if re.search(r'[\$\^:\*]', row):
continue
if row.count('/'):
# 清除后缀
row = row.rstrip('/^*')
# 不能含有的字符
if re.search(r'[\/\^:\*]', row):
continue
if not re.match(r'\w+(\.\w+)+$', row):
continue
# match
domains.append(row)
# 只匹配域名或 IP
if re.match(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$', row) or re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', row):
domains.append(row)
print('done.')

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
# top500 direct list update time: 2017-11-26 15:20:47
# top500 direct list update time: 2018-01-11 17:42:31
123movies.to
163.com
1688.com
@@ -19,7 +19,6 @@ adexchangeprediction.com
adf.ly
adnetworkperformance.com
adobe.com
airbnb.com
alibaba.com
aliexpress.com
alipay.com
@@ -48,7 +47,6 @@ baidu.com
baike.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
behance.net
bestbuy.com
@@ -61,7 +59,6 @@ bitauto.com
blackboard.com
blastingnews.com
blkget.com
bongacams.com
booking.com
box.com
bukalapak.com
@@ -89,6 +86,7 @@ daum.net
dcinside.com
dell.com
detail.tmall.com
detik.com
deviantart.com
dictionary.com
digikala.com
@@ -129,7 +127,6 @@ giphy.com
github.com
github.io
gizmodo.com
globaloffers.link
globo.com
gmx.net
go.com
@@ -137,6 +134,7 @@ godaddy.com
goo.ne.jp
goodreads.com
groupon.com
gsmarena.com
hao123.com
haosou.com
hatena.ne.jp
@@ -191,6 +189,7 @@ mashable.com
mediafire.com
mediawhirl.net
mega.nz
mercadolibre.com.ar
mercadolivre.com.br
mi.com
microsoft.com
@@ -222,7 +221,6 @@ oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
@@ -273,7 +271,6 @@ spotscenered.info
stackexchange.com
stackoverflow.com
state.gov
steamcommunity.com
steampowered.com
subscene.com
taboola.com

View File

@@ -1,7 +1,9 @@
# top500 proxy list update time: 2017-11-26 15:20:47
# top500 proxy list update time: 2018-01-11 17:42:31
4shared.com
airbnb.com
archive.org
ask.com
bbc.co.uk
beeg.com
bet365.com
blog.jp
@@ -11,11 +13,11 @@ blogspot.com.br
blogspot.in
blogspot.jp
bloomberg.com
bongacams.com
bp.blogspot.com
chaturbate.com
cloudfront.net
dailymotion.com
detik.com
disqus.com
doubleclick.net
dropbox.com
@@ -28,8 +30,8 @@ fbcdn.net
fc2.com
files.wordpress.com
flipkart.com
globaloffers.link
goo.gl
gsmarena.com
hclips.com
hootsuite.com
hurriyet.com.tr
@@ -38,10 +40,10 @@ livedoor.jp
ltn.com.tw
media.tumblr.com
medium.com
mercadolibre.com.ar
messenger.com
nyaa.se
nytimes.com
ozock.com
pinterest.com
pixnet.net
pornhub.com
@@ -53,6 +55,7 @@ scribd.com
shutterstock.com
slideshare.net
spankbang.com
steamcommunity.com
t.co
telegram.org
thepiratebay.org