# Mirror of https://github.com/Johnshall/Shadowrocket-ADBlock-Rules-Forever.git (synced 2025-12-17 07:18:33 +08:00; 139 lines, 3.2 KiB, Python)
# -*- coding: utf-8 -*-

#
# Extract ad-blocking rules, keeping only the rules that block an entire domain.
#

# Reference for the Adblock Plus filter format: https://adblockplus.org/filters

import re
import sys
import time

import requests


# Adblock Plus style filter lists to download, fetched in this order.
rules_url = [
    # EasyList China
    'https://easylist-downloads.adblockplus.org/easylistchina.txt',
    # EasyList + China
    'https://easylist-downloads.adblockplus.org/easylistchina+easylist.txt',
    # ChengFeng ad-filter rules
    'https://raw.githubusercontent.com/xinggsf/Adblock-Plus-Rule/master/rule.txt',
    # Peter Lowe's ad and privacy-tracking server list
    'https://pgl.yoyo.org/adservers/serverlist.php?hostformat=adblockplus;showintro=0',
]

# Concatenated text of every downloaded list.
rule = ''

# Collected block entries; contains both domains and IPs.
domains = []


# Download every rule list and append its body to `rule`.
# Each URL gets up to 5 attempts with a one-second pause after a failure;
# if all attempts fail the script exits with an error message.
for rule_url in rules_url:
    print('loading... ' + rule_url)

    # get rule text
    success = False
    try_times = 0
    r = None
    last_error = None
    while try_times < 5 and not success:
        try:
            # The timeout keeps a stalled server from hanging the script.
            r = requests.get(rule_url, timeout=30)
        except requests.RequestException as err:
            # Connection-level failures are retried like bad status codes.
            r = None
            last_error = err

        if r is not None and r.status_code == 200:
            success = True
        else:
            time.sleep(1)
            try_times = try_times + 1

    if not success:
        if r is None:
            # The last attempt never produced a response, so there is no
            # status code to report; show the exception instead.
            sys.exit('error in request %s\n\t%s' % (rule_url, last_error))
        sys.exit('error in request %s\n\treturn code: %d' % (rule_url, r.status_code))

    rule = rule + r.text + '\n'

# Load the exception list: one entry per line of resultant/ad_ignore.list.
# Blank lines and lines starting with '#' are skipped. A missing or
# unreadable file simply means there are no exceptions.
ignore = []
try:
    # Python 2's built-in open() has no `encoding` parameter.
    if sys.version_info.major == 3:
        ad_ignore = open('resultant/ad_ignore.list', 'r', encoding='utf-8')
    else:
        ad_ignore = open('resultant/ad_ignore.list', 'r')
except (IOError, OSError):
    ad_ignore = None

if ad_ignore is not None:
    # The context manager closes the file once it has been read.
    with ad_ignore:
        for ig in ad_ignore:
            entry = ig.strip()
            if entry and not entry.startswith('#'):
                ignore.append(entry)

# Parse the downloaded rule text and collect plain domains / IPv4 addresses
# into `domains`.

# Patterns are compiled once because the loop below runs for every rule line.
_SCHEME_PREFIX_RE = re.compile(r'^\|?https?://')
_ANCHOR_PREFIX_RE = re.compile(r'^\|\|')
_PORT_SUFFIX_RE = re.compile(r':\d{2,5}$')
_FORBIDDEN_CHARS_RE = re.compile(r'[/^:*]')
_DOMAIN_RE = re.compile(r'^\.?[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})*\.[a-zA-Z0-9][-a-zA-Z0-9]{1,}$')
_IPV4_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

# Set membership keeps the per-row ignore check constant-time.
_ignored = frozenset(ignore)

rule = rule.split('\n')
for row in rule:
    row = row.strip()
    row0 = row  # untouched copy, used only for the 'ignore:' log line

    # Exception rules ('@@...'): drop every domain collected SO FAR whose
    # text occurs anywhere in the exception rule. Domains added by later
    # rows are not affected.
    if row.startswith('@@'):
        domains[:] = [domain for domain in domains if domain not in row]
        continue

    # Blocking rules.

    # Skip blank lines, comments ('!'), rules carrying options ('$') and
    # element-hiding rules ('##').
    if row == '' or row.startswith('!') or "$" in row or "##" in row:
        continue

    # Strip prefixes: optional '|' + scheme, the '||' domain anchor, then
    # any leading dots / wildcards.
    row = _SCHEME_PREFIX_RE.sub('', row)
    row = _ANCHOR_PREFIX_RE.sub('', row)
    row = row.lstrip('.*')

    # Strip suffixes: trailing slashes / wildcards, then separator carets,
    # then a port number. rstrip() is a no-op on an empty string, so this
    # cannot raise the way indexing row[-1] did.
    row = row.rstrip('/*')
    row = row.rstrip('^')
    row = _PORT_SUFFIX_RE.sub('', row)

    # Nothing left after stripping (e.g. the rule was just '||' or '*').
    if not row:
        continue

    # Characters that mean the rule is more than a bare host.
    if _FORBIDDEN_CHARS_RE.search(row):
        print('ignore: '+row0)
        continue

    # Drop entries listed in the ignore file.
    if row in _ignored:
        continue

    # Keep only domain names or IPv4 addresses.
    if _DOMAIN_RE.match(row) or _IPV4_RE.match(row):
        domains.append(row)

print('done.')


# Write the result: a timestamp header followed by the de-duplicated,
# sorted entries, one per line. If resultant/ad.list cannot be opened the
# output goes to stdout instead.
try:
    # Python 2's built-in open() has no `encoding` parameter.
    if sys.version_info.major == 3:
        file_ad = open('resultant/ad.list', 'w', encoding='utf-8')
    else:
        file_ad = open('resultant/ad.list', 'w')
except (IOError, OSError):
    file_ad = sys.stdout

domains = sorted(set(domains))

try:
    file_ad.write('# adblock rules refresh time: ' + time.strftime("%Y-%m-%d %H:%M:%S") + '\n')
    file_ad.writelines(item + '\n' for item in domains)
finally:
    # Close (and thereby flush) the real file, but never close stdout.
    if file_ad is not sys.stdout:
        file_ad.close()