更好地解析广告规则

2025-12-17 23:34:48 +08:00 · 2018-01-11 17:45:10 +08:00
parent 88592804d3
commit fb878a4c81
12 changed files with 1822 additions and 173 deletions
--- a/factory/ad.py
+++ b/factory/ad.py
@@ -1,5 +1,9 @@
 # -*- coding: utf-8 -*-

+#
+# 提取广告规则，并且只提取对全域禁止的那种规则
+#
+
 import time
 import sys
 import requests
@@ -40,27 +44,30 @@ for rule_url in rules_url:

    rule = r.text

-    # parse html
+    # parse rule
    rule = rule.split('\n')
    for row in rule:
-        if not row.startswith('||') and not row.startswith('|http'):
+        row = row.strip()
+
+        # Skip outright: comment lines (!), exception rules (@@), and any rule containing "$"
+        if row.startswith('!') or row.startswith('@@') or "$" in row:
            continue

-        # del prefix
-        row = re.sub(r'^\|(\||https?:\/\/)', '', row)
-        # del suffix
-        row = row.rstrip('/^ ')
+        # 清除前缀
+        row = re.sub(r'^\|?https?:\/\/', '', row)
+        row = re.sub(r'^\|\|', '', row)
+        row = row.lstrip('.*')

-        if re.search(r'[\$\^:\*]', row):
-            continue
-        if row.count('/'):
+        # 清除后缀
+        row = row.rstrip('/^*')
+
+        # 不能含有的字符
+        if re.search(r'[\/\^:\*]', row):
            continue

-        if not re.match(r'\w+(\.\w+)+$', row):
-            continue
-
-        # match
-        domains.append(row)
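+        # Keep only plain hostnames (labels of 1-63 chars, alphabetic TLD of 2-9 letters) or dotted-quad IPv4 (octet range is not checked)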
+        if re.match(r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,9}$', row) or re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', row):
+            domains.append(row)

    print('done.')