Add multithreading

2025-12-17 15:24:43 +08:00 · 2022-02-07 02:01:27 +08:00
parent 80cd0bad8e
commit 725116bed2
1 changed files with 41 additions and 21 deletions
--- a/factory/top500_manual.py
+++ b/factory/top500_manual.py
@@ -3,13 +3,13 @@
 '''
 此脚本用于对 top500_manual.list 中网站进行评估，判断需要直连或代理
 该脚本应当在内网环境中运行
-TODO:并发
 '''

 import requests
 import time
+import threading

-# 读入 top500 列表
+# Read the top500 domain list
 domains = [] 
 with open("resultant/top500_manual.list", "r", encoding='utf-8') as f:
    for domain in f.readlines():
@@ -17,17 +17,23 @@ with open("resultant/top500_manual.list", "r", encoding='utf-8') as f:
            continue
        domains.append(domain[:-1])

-# 判断直连或代理
 domains_proxy = []
 domains_direct = []
-
-def UrlScaner(domain):
-    requests_header = {
+requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
 }
+
+# Worker thread that decides whether each domain needs direct or proxy access
+class DomainScaner(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+
+    def run(self):
+        while len(domains):
+            domain = domains.pop(0)
            is_proxy = False
            try:
                requests.get('http://www.' + domain, timeout=10, headers=requests_header)
@@ -42,10 +48,24 @@ def UrlScaner(domain):
            else:
                domains_direct.append(domain)

-    print('Proxy %s：%s' % (is_proxy, domain) )
+            print('[Doamins Remain: %d]\tProxy %s：%s' % (len(domains), is_proxy, domain) )

-for domain in domains:
-    UrlScaner(domain)
+
+        global scaner_thread_num
+        scaner_thread_num -= 1
+
+
+print('top500 Script Starting...\n\n')
+
+# Start the workers; each one decrements scaner_thread_num when it exits.
+scaner_threads = [DomainScaner() for _ in range(5)]
+scaner_thread_num = len(scaner_threads)
+for scaner_thread in scaner_threads:
+    scaner_thread.start()
+
+# Wait for every worker with join(), which blocks without spinning the CPU.
+for scaner_thread in scaner_threads:
+    scaner_thread.join()


 # write files