# Mirror of https://github.com/Johnshall/Shadowrocket-ADBlock-Rules-Forever.git
# (synced 2025-12-17 07:18:33 +08:00; 124 lines, 2.8 KiB, Python)
# -*- coding: utf-8 -*-
|
|
|
|
#
# Download and parse the latest version of GFWList.
# For sites of mixed nature, prefer the proxy (all @@ directives are ignored).
#
# Supplementary GFWList entries are fetched from
# https://github.com/Johnshall/cn-blocked-domain
# Thanks to https://github.com/Loyalsoldier/cn-blocked-domain
#
|
|
|
|
|
|
import time
|
|
import requests
|
|
import re
|
|
import base64
|
|
|
|
|
|
# Rules that could not be reduced to a plain hostname. filtrate_rules() appends
# to this list and the script writes it to resultant/gfw_unhandle.log.
unhandle_rules = []
|
|
|
|
def get_rule(rules_url, ruleType='raw'):
    """Download a rule list and return it as text.

    The URL is requested up to five times, sleeping one second after every
    failed attempt. Both a non-200 status code and a transport-level error
    (connection failure, timeout, ...) count as a failed attempt.

    Args:
        rules_url: URL of the rule list to download.
        ruleType: 'base64' when the response body is base64-encoded; the
            body is then decoded as UTF-8 and every literal backslash-n
            sequence is replaced by a real newline. Any other value
            returns the response text unchanged.

    Returns:
        The rule list as a string.

    Raises:
        Exception: if none of the attempts returned HTTP 200.
    """
    max_tries = 5
    r = None
    last_error = None

    for _ in range(max_tries):
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(rules_url, timeout=30)
        except requests.RequestException as err:
            # Transient network errors are retried like a bad status code.
            last_error = err
            r = None
        else:
            if r.status_code == 200:
                break
        time.sleep(1)
    else:
        # Every attempt failed; report whatever the last attempt produced.
        if r is None:
            raise Exception(
                'error in request %s\n\t%s' % (rules_url, last_error)
            ) from last_error
        raise Exception('error in request %s\n\treturn code: %d' % (rules_url, r.status_code))

    if ruleType == 'base64':
        decoded = base64.b64decode(r.text).decode("utf-8")
        return decoded.replace('\\n', '\n')

    return r.text
|
|
|
|
|
|
def clear_format(rule):
    """Split raw rule text into rows and strip the rule-syntax decoration.

    Blank rows, comment rows ('!'), whitelist rows ('@@') and the
    '[AutoProxy' header row are dropped. For every remaining row the
    leading '|http://' / '|https://' / '||' marker and any leading '.' or
    '*' characters are removed, as are trailing '/', '^' and '*'
    characters.

    Args:
        rule: the complete rule text, rows separated by newlines.

    Returns:
        A list with one cleaned string per kept row, in input order.
    """
    skipped_prefixes = ('!', '@@', '[AutoProxy')
    cleaned = []

    for line in rule.split('\n'):
        entry = line.strip()

        # Comments, whitelist rules and the file header carry no domain.
        if not entry or entry.startswith(skipped_prefixes):
            continue

        # Strip the prefix decoration.
        entry = re.sub(r'^\|?https?://', '', entry)
        entry = re.sub(r'^\|\|', '', entry)

        # Strip leading wildcard/dot characters and the suffix decoration.
        cleaned.append(entry.lstrip('.*').rstrip('/^*'))

    return cleaned
|
|
|
|
|
|
def filtrate_rules(rules, excludes=None):
    """Reduce cleaned rules to a sorted, de-duplicated list of hostnames.

    For each rule only the part before the first '/' is kept. A rule whose
    hostname part contains anything other than word characters, '.' or '-'
    is appended unchanged to the module-level ``unhandle_rules`` list and
    left out of the result. Hostnames listed in ``excludes`` are dropped.

    Args:
        rules: iterable of rule strings.
        excludes: optional iterable of hostnames to leave out.

    Returns:
        A sorted list of unique hostnames.
    """
    # A set gives O(1) membership tests; None avoids a shared mutable default.
    excluded = set(excludes) if excludes else set()
    hostnames = set()

    for original in rules:
        # only hostname: discard any path component
        host = original.split('/', 1)[0]

        if not re.match(r'^[\w.-]+$', host):
            unhandle_rules.append(original)
            continue

        if host not in excluded:
            hostnames.add(host)

    return sorted(hostnames)
|
|
|
|
def getURLs(url):
    """Fetch ``url`` and return the response body split into lines.

    Args:
        url: address of a newline-separated text resource.

    Returns:
        A list of the lines in the response text, without the empty string
        that a trailing newline would otherwise leave at the end.
    """
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, timeout=30)
    lines = r.text.split("\n")

    # Drop only the empty tail produced by a trailing newline; slicing off
    # the last element unconditionally would lose the final line of a body
    # that does not end with a newline.
    if lines[-1] == "":
        lines.pop()

    return lines
|
|
|
|
# main
|
|
|
|
# Download the base64-encoded GFWList plus the supplementary domain list from
# https://github.com/Johnshall/cn-blocked-domain
sources = [
    get_rule(rules_url='https://raw.githubusercontent.com/gfwlist/gfwlist/master/gfwlist.txt', ruleType='base64'),
    get_rule('https://raw.githubusercontent.com/Johnshall/cn-blocked-domain/release/domains.txt'),
]

# Join with an explicit newline so the last row of one list can never be
# merged with the first row of the next; clear_format() skips blank rows.
rule = '\n'.join(sources)

rules = clear_format(rule)

# Read the manually maintained hostnames to exclude, skipping blank lines
# and '#' comment lines.
excludes = []
with open('manual_gfwlist_excludes.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        excludes.append(line)

rules = filtrate_rules(rules, excludes)

# De-duplicate while keeping a sorted order so the generated file is
# identical from run to run (a bare list(set(...)) has arbitrary order).
rules = sorted(set(rules))

# Context managers make sure both output files are flushed and closed.
with open('resultant/gfw.list', 'w', encoding='utf-8') as f:
    f.write('\n'.join(rules))

with open('resultant/gfw_unhandle.log', 'w', encoding='utf-8') as f:
    f.write('\n'.join(unhandle_rules))
|