remove top500.py

2025-12-18 07:44:57 +08:00 · 2021-12-05 16:59:42 +08:00
parent 0fa95e8b0c
commit f44d921ec3
3 changed files with 177 additions and 160 deletions
--- a/factory/top500.py
+++ b/factory/top500.py
@@ -1,160 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from bs4 import BeautifulSoup
-import threading
-import time
-import sys
-import requests
-import re
-
-
-urls = ['http://alexa.chinaz.com/Global/index.html']
-for i in range(2,21):
-    urls.append('http://alexa.chinaz.com/Global/index_%d.html'%i)
-
-urls_scan_over = False
-
-domains = []
-
-domains_proxy = []
-domains_direct = []
-
-
-# thread to scan pages in urls
-class UrlScaner(threading.Thread):
-    def __init__(self):
-        threading.Thread.__init__(self)
-
-    def run(self):
-        global urls_scan_over, urls
-
-        done_num = 0
-
-        while len(urls):
-            html = self.fetchHTML( urls.pop(0) )
-            self.praseHTML(html)
-
-            done_num = done_num + 25
-            print('top500 List Got: %d/500'%done_num)
-
-            time.sleep(1)
-
-        urls_scan_over = True
-        print('top500 List Fetched Over.')
-
-
-    def fetchHTML(self, url):
-        success = False
-        try_times = 0
-        r = None
-        while try_times < 5 and not success:
-            r = requests.get(url)
-            if r.status_code != 200:
-                time.sleep(1)
-                try_times = try_times + 1
-            else:
-                success = True
-                break
-
-        if not success:
-            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code) )
-
-        r.encoding = 'utf-8'
-        return r.text
-
-
-    def praseHTML(self, html):
-        soup = BeautifulSoup(html, "lxml")
-        namesDom = soup.select("div.righttxt h3 span")
-
-        for name in namesDom:
-            domains.append(name.string)
-
-
-requests_header = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
-    'Cache-Control': 'max-age=0',
-    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
-    'Connection': 'keep-alive'
-}
-
-
-# thread to visit websites
-class DomainScaner(threading.Thread):
-    def __init__(self):
-        threading.Thread.__init__(self)
-
-    def run(self):
-        while not urls_scan_over or len(domains):
-            if len(domains) == 0:
-                time.sleep(2)
-                continue
-
-            domain = domains.pop(0)
-
-            if domain.endswith('.cn'):
-                continue
-            if 'google' in domain:
-                continue
-
-            is_proxy = False
-
-            try:
-                requests.get('http://www.' + domain, timeout=10, headers=requests_header)
-            except BaseException:
-                try:
-                    requests.get('http://' + domain, timeout=10, headers=requests_header)
-                except BaseException:
-                    is_proxy = True
-
-            if is_proxy:
-                domains_proxy.append(domain)
-            else:
-                domains_direct.append(domain)
-
-            print('[Doamins Remain: %d]\tProxy %s：%s' % (len(domains), is_proxy, domain) )
-
-        global scaner_thread_num
-        scaner_thread_num -= 1
-
-
-print('top500 Script Starting...\n\n')
-
-# Start Thread
-UrlScaner().start()
-scaner_thread_num = 0
-for i in range(3):
-    DomainScaner().start()
-    scaner_thread_num += 1
-
-# wait thread done
-while scaner_thread_num:
-    pass
-
-# write files
-file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
-file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')
-
-now_time = time.strftime("%Y-%m-%d %H:%M:%S")
-file_proxy.write('# top500 proxy list update time: ' + now_time + '\n')
-file_direct.write('# top500 direct list update time: ' + now_time + '\n')
-
-
-# 将苹果IP加入直连
-# 由于本脚本应当运行在内部环境中，可能无法访问Github，故改用staticdn.net提供的CDN节点
-# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
-r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
-for url in r.text.split("\n")[:-1]:
-    url = re.sub(r'(server=\/)', '', url)   # 清除前缀
-    url = re.sub(r'(/114.114.114.114)', '', url)   # 清除后缀
-    domains_direct.append(url)
-
-domains_direct = list( set(domains_direct) )
-domains_proxy  = list( set(domains_proxy) )
-domains_direct.sort()
-domains_proxy.sort()
-
-for domain in domains_direct:
-    file_direct.write(domain+'\n')
-for domain in domains_proxy:
-    file_proxy.write(domain+'\n')