更好地解析广告规则

This commit is contained in:
Moshel
2018-01-11 17:45:10 +08:00
parent 88592804d3
commit fb878a4c81
12 changed files with 1822 additions and 173 deletions

View File

@@ -1,5 +1,9 @@
# -*- coding: utf-8 -*-
#
# Extract ad-blocking rules, keeping only rules that block a whole domain (or IP) unconditionally.
#
import time
import sys
import requests
@@ -40,27 +44,30 @@ for rule_url in rules_url:
rule = r.text
# parse html
# parse rule
rule = rule.split('\n')
for row in rule:
if not row.startswith('||') and not row.startswith('|http'):
row = row.strip()
# Skip outright: comment lines ('!'), exception rules ('@@'), and any rule containing '$'.
if row.startswith('!') or row.startswith('@@') or "$" in row:
continue
# del prefix
row = re.sub(r'^\|(\||https?:\/\/)', '', row)
# del suffix
row = row.rstrip('/^ ')
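# Strip the prefix: a leading http(s):// scheme (optionally preceded by '|'), a leading '||', then any leading '.' or '*' characters.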
row = re.sub(r'^\|?https?:\/\/', '', row)
row = re.sub(r'^\|\|', '', row)
row = row.lstrip('.*')
if re.search(r'[\$\^:\*]', row):
continue
if row.count('/'):
# Strip the suffix: any trailing '/', '^' or '*' characters.
row = row.rstrip('/^*')
# Reject rows that still contain a forbidden character: '/', '^', ':' or '*'.
if re.search(r'[\/\^:\*]', row):
continue
if not re.match(r'\w+(\.\w+)+$', row):
continue
# match
domains.append(row)
# Keep only rows that are a bare domain name or a dotted-quad IPv4 address (octet range is not validated).
if re.match(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$', row) or re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', row):
domains.append(row)
print('done.')