Files
Johnshall-Shadowrocket-ADBl…/factory/ad.py
2018-01-11 17:45:10 +08:00

96 lines
2.2 KiB
Python

# -*- coding: utf-8 -*-
#
# Extract ad-blocking rules, keeping only the rules that block a host on every site.
#
import time
import sys
import requests
import re
# Adblock filter lists to download; every entry is fetched and parsed in turn.
rules_url = [
    # EasyList China only (disabled; the combined list below is used instead)
    # 'https://easylist-downloads.adblockplus.org/easylistchina.txt',

    # EasyList + EasyList China
    'https://easylist-downloads.adblockplus.org/easylistchina+easylist.txt',

    # ChengFeng ad-filter rules
    'https://raw.githubusercontent.com/xinggsf/Adblock-Plus-Rule/master/ABP-FX.txt',
]
# Hosts collected from every rule list; holds both domain names and
# dotted-quad IP addresses.
domains = []

# Patterns are compiled once here instead of being looked up for every row.
_SCHEME_PREFIX = re.compile(r'^\|?https?:\/\/')
_ANCHOR_PREFIX = re.compile(r'^\|\|')
_FORBIDDEN_CHARS = re.compile(r'[\/\^:\*]')
_DOMAIN_PATTERN = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$')
_IP_PATTERN = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')


def _download_rule_text(url, attempts=5, timeout=30):
    """Return the text of the rule list at *url*, or exit the script.

    The request is tried up to *attempts* times, sleeping one second after
    each failure.  A failure is either a non-200 status code or a
    network-level error raised by ``requests``.  *timeout* (seconds) is
    passed to ``requests.get`` so a stalled server cannot hang the script
    indefinitely.

    Exits via ``sys.exit`` with an error message when every attempt fails.
    """
    response = None
    failure = None
    for _ in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Remember the error so it can be reported if this was the last try.
            response = None
            failure = exc
        else:
            if response.status_code == 200:
                return response.text
        time.sleep(1)

    if response is None:
        # The last attempt never produced an HTTP response at all.
        sys.exit('error in request %s\n\t%s' % (url, failure))
    sys.exit('error in request %s\n\treturn code: %d' % (url, response.status_code))


def _host_from_rule(row):
    """Return the bare domain or IP that the filter line *row* names, else None.

    Only rules that reduce to a plain host are kept; anything carrying a
    path, port, wildcard or separator in the middle is rejected.
    """
    row = row.strip()

    # Skip outright: lines starting with '!' or '@@', and any line containing
    # '$' (in Adblock Plus syntax: comments, exception rules, rule options).
    if row.startswith(('!', '@@')) or '$' in row:
        return None

    # Strip leading markers: an optional '|' + URL scheme, a '||' anchor,
    # then any leading dots / wildcards.
    row = _SCHEME_PREFIX.sub('', row)
    row = _ANCHOR_PREFIX.sub('', row)
    row = row.lstrip('.*')

    # Strip trailing slashes, separators and wildcards.
    row = row.rstrip('/^*')

    # Characters that must not remain once the ends have been trimmed.
    if _FORBIDDEN_CHARS.search(row):
        return None

    # Accept only something shaped like a domain name or a dotted-quad IP.
    if _DOMAIN_PATTERN.match(row) or _IP_PATTERN.match(row):
        return row
    return None


for rule_url in rules_url:
    print('loading... ' + rule_url)

    for row in _download_rule_text(rule_url).split('\n'):
        host = _host_from_rule(row)
        if host is not None:
            domains.append(host)

    print('done.')
# Write the collected hosts, sorted and de-duplicated, to resultant/ad.list.
# If that file cannot be opened the list is written to stdout instead.
domains.sort()

try:
    if sys.version_info.major == 3:
        file_ad = open('resultant/ad.list', 'w', encoding='utf-8')
    else:
        # Python 2's built-in open() has no ``encoding`` parameter.
        file_ad = open('resultant/ad.list', 'w')
except (IOError, OSError) as err:
    # Keep the best-effort fallback, but say why the file was not used.
    sys.stderr.write('cannot open resultant/ad.list (%s); writing to stdout instead\n' % err)
    file_ad = sys.stdout

try:
    file_ad.write('# adblock rules refresh time: ' + time.strftime("%Y-%m-%d %H:%M:%S") + '\n')

    # ``domains`` is sorted, so duplicates are adjacent: skip any entry equal
    # to the one written just before it.
    last = ''
    for item in domains:
        if last == item:
            continue
        file_ad.write(item + '\n')
        last = item
finally:
    # Flush and release the output file; never close the interpreter's stdout.
    if file_ad is not sys.stdout:
        file_ad.close()