From e6764fe6281cd5131e15cd0370c43d25b9094ed0 Mon Sep 17 00:00:00 2001
From: Johnshall
Date: Thu, 9 Dec 2021 20:11:01 +0800
Subject: [PATCH] Remove top50.py and go back to top500.py. Just a memo.

---
 factory/top50.py  | 113 -------------------------------
 factory/top500.py | 168 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 113 deletions(-)
 delete mode 100644 factory/top50.py
 create mode 100644 factory/top500.py

diff --git a/factory/top50.py b/factory/top50.py
deleted file mode 100644
index 904edec..0000000
--- a/factory/top50.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from bs4 import BeautifulSoup
-import threading
-import time
-import sys
-import requests
-import re
-
-'''
-# Update note:
-
-# The original source http://alexa.chinaz.com/Global/index.html now only
-# provides the top-50 list and is hard to scrape, so we scrape the worldwide
-# top-50 list from www.similarweb.com instead.
-# The worldwide top-500 list is no longer available for free, so the fairly
-# representative top-50 list is used in its place.
-
-urls = ['http://alexa.chinaz.com/Global/index.html']
-for i in range(2, 21):
-    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
-'''
-
-
-urls = 'https://www.similarweb.com/zh/top-websites/united-states/?utm_source=addon&utm_medium=chrome&utm_content=overview&utm_campaign=country-rank'
-
-urls_scan_over = False
-
-domains = []
-
-domains_proxy = []
-domains_direct = []
-
-requests_header = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
-    'Cache-Control': 'max-age=0',
-    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
-    'Connection': 'keep-alive'
-}
-
-
-def getTop(url):
-    # fetch the SimilarWeb ranking page and collect the site names
-    r = requests.get(url=url, headers=requests_header)
-    soup = BeautifulSoup(r.text, "lxml")
-    namesDom = soup.select("span.topRankingGrid-titleName")
-
-    for name in namesDom:
-        domains.append(name.string)
-
-    print('{:-^30}'.format('We got it!'))
-    print('{:-^30}'.format('Top50 Fetching over'))
-    print('\n')
-    print('\n\n')
-
-
-# Start
-print('{:-^30}'.format('Top50 Script Starting'))
-print('\n')
-getTop(urls)
-
-# visit each website to decide direct vs. proxy
-while len(domains):
-    domain = domains.pop(0)
-    is_proxy = False
-
-    try:
-        requests.get('http://www.' + domain, timeout=10, headers=requests_header)
-    except BaseException:
-        try:
-            requests.get('http://' + domain, timeout=10, headers=requests_header)
-        except BaseException:
-            is_proxy = True
-
-    if is_proxy:
-        domains_proxy.append(domain)
-        print(domain + " is proxy\n")
-    else:
-        domains_direct.append(domain)
-        print(domain + " is direct\n")
-
-    print('[Domains Remain: %d]\tProxy %s:%s\n' % (len(domains), is_proxy, domain))
-
-
-# Add the Apple domains to the direct list.
-# This script is expected to run in an environment where GitHub may be
-# unreachable, so the CDN mirror provided by staticdn.net is used instead.
-# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
-print('{:-^30}'.format('Adding Apple domains to direct'))
-print('\n\n')
-r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
-for url in r.text.split("\n")[:-1]:
-    url = re.sub(r'^server=/', '', url)             # strip the "server=/" prefix
-    url = re.sub(r'/114\.114\.114\.114$', '', url)  # strip the "/114.114.114.114" suffix
-    domains_direct.append(url)
-
-# write the result files
-file_proxy = open('resultant/top50_proxy.list', 'w', encoding='utf-8')
-file_direct = open('resultant/top50_direct.list', 'w', encoding='utf-8')
-
-now_time = time.strftime("%Y-%m-%d %H:%M:%S")
-file_proxy.write('# top50 proxy list update time: ' + now_time + '\n')
-file_direct.write('# top50 direct list update time: ' + now_time + '\n')
-
-domains_direct = list(set(domains_direct))
-domains_proxy = list(set(domains_proxy))
-domains_direct.sort()
-domains_proxy.sort()
-
-for domain in domains_direct:
-    file_direct.write(domain + '\n')
-for domain in domains_proxy:
-    file_proxy.write(domain + '\n')
-
-print('{:-^30}'.format('Done!'))
diff --git a/factory/top500.py b/factory/top500.py
new file mode 100644
index 0000000..b0538c3
--- /dev/null
+++ b/factory/top500.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+
+'''
+# Warning:
+# chinaz.com no longer provides the global top-500 list for free, so this
+# script currently does not work. It is kept only as a backup and will be
+# re-released once a reliable top-500 source is available.
+'''
+
+from bs4 import BeautifulSoup
+import threading
+import time
+import sys
+import requests
+import re
+
+urls = ['http://alexa.chinaz.com/Global/index.html']
+for i in range(2, 21):
+    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
+
+urls_scan_over = False
+
+domains = []
+
+domains_proxy = []
+domains_direct = []
+
+
+# thread that scans the ranking pages listed in urls
+class UrlScaner(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+
+    def run(self):
+        global urls_scan_over, urls
+
+        done_num = 0
+
+        while len(urls):
+            html = self.fetchHTML(urls.pop(0))
+            self.parseHTML(html)
+
+            done_num = done_num + 25  # each page lists 25 sites
+            print('top500 List Got: %d/500' % done_num)
+
+            time.sleep(1)
+
+        urls_scan_over = True
+        print('top500 List Fetched Over.')
+
+    def fetchHTML(self, url):
+        # fetch a page, retrying up to 5 times on non-200 responses
+        success = False
+        try_times = 0
+        r = None
+        while try_times < 5 and not success:
+            r = requests.get(url)
+            if r.status_code != 200:
+                time.sleep(1)
+                try_times = try_times + 1
+            else:
+                success = True
+                break
+
+        if not success:
+            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code))
+
+        r.encoding = 'utf-8'
+        return r.text
+
+    def parseHTML(self, html):
+        soup = BeautifulSoup(html, "lxml")
+        namesDom = soup.select("div.righttxt h3 span")
+
+        for name in namesDom:
+            domains.append(name.string)
+
+
+requests_header = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
+    'Cache-Control': 'max-age=0',
+    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
+    'Connection': 'keep-alive'
+}
+
+
+# thread that visits each scraped website to decide direct vs. proxy
+class DomainScaner(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+
+    def run(self):
+        # keep working while pages are still being scanned or domains remain
+        while not urls_scan_over or len(domains):
+            if len(domains) == 0:
+                time.sleep(2)
+                continue
+
+            domain = domains.pop(0)
+
+            if domain.endswith('.cn'):
+                continue
+            if 'google' in domain:
+                continue
+
+            is_proxy = False
+
+            try:
+                requests.get('http://www.' + domain, timeout=10, headers=requests_header)
+            except BaseException:
+                try:
+                    requests.get('http://' + domain, timeout=10, headers=requests_header)
+                except BaseException:
+                    is_proxy = True
+
+            if is_proxy:
+                domains_proxy.append(domain)
+            else:
+                domains_direct.append(domain)
+
+            print('[Domains Remain: %d]\tProxy %s:%s' % (len(domains), is_proxy, domain))
+
+        global scaner_thread_num
+        scaner_thread_num -= 1
+
+
+print('top500 Script Starting...\n\n')
+
+# start the scanner threads
+UrlScaner().start()
+scaner_thread_num = 0
+for i in range(3):
+    DomainScaner().start()
+    scaner_thread_num += 1
+
+# wait until every DomainScaner thread has finished
+while scaner_thread_num:
+    time.sleep(1)
+
+# Add the Apple domains to the direct list.
+# This script is expected to run in an environment where GitHub may be
+# unreachable, so the CDN mirror provided by staticdn.net is used instead.
+# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
+print('{:-^30}'.format('Adding Apple domains to direct'))
+print('\n\n')
+r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
+for url in r.text.split("\n")[:-1]:
+    url = re.sub(r'^server=/', '', url)             # strip the "server=/" prefix
+    url = re.sub(r'/114\.114\.114\.114$', '', url)  # strip the "/114.114.114.114" suffix
+    domains_direct.append(url)
+
+# write the result files
+file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
+file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')
+
+now_time = time.strftime("%Y-%m-%d %H:%M:%S")
+file_proxy.write('# top500 proxy list update time: ' + now_time + '\n')
+file_direct.write('# top500 direct list update time: ' + now_time + '\n')
+
+domains_direct = list(set(domains_direct))
+domains_proxy = list(set(domains_proxy))
+domains_direct.sort()
+domains_proxy.sort()
+
+for domain in domains_direct:
+    file_direct.write(domain + '\n')
+for domain in domains_proxy:
+    file_proxy.write(domain + '\n')
+
+file_proxy.close()
+file_direct.close()
+
+print('{:-^30}'.format('Done!'))
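
For reference, the apple.china.conf lines consumed by both scripts are dnsmasq
"server=" entries, one domain per line, which the two re.sub calls reduce to a
bare domain. A minimal standalone sketch of the same extraction; the sample
line is illustrative and assumed to match the file's format:

import re

line = 'server=/apple.com/114.114.114.114'  # assumed sample entry
m = re.match(r'^server=/(?P<domain>[^/]+)/', line)
if m:
    print(m.group('domain'))  # prints: apple.com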
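
top500.py hands work from the producer (UrlScaner) to the consumers
(DomainScaner) through a shared list, an urls_scan_over flag, and a thread
counter that the main thread polls. Below is a minimal sketch of the same
hand-off using the standard library's queue.Queue and Thread.join() instead;
this is an alternative pattern with stand-in functions, not what the patch
implements:

import threading
import queue

domains_q = queue.Queue()

def producer():
    for d in ('example.com', 'example.org'):  # stand-in for the page scraper
        domains_q.put(d)
    domains_q.put(None)  # sentinel: tells the consumer there is no more work

def consumer():
    while True:
        d = domains_q.get()
        if d is None:
            break
        print('visiting', d)  # stand-in for the reachability probe

t_prod = threading.Thread(target=producer)
t_cons = threading.Thread(target=consumer)
t_prod.start()
t_cons.start()
t_prod.join()  # blocks without spinning, unlike polling scaner_thread_num
t_cons.join()

With several consumer threads, the producer would put one sentinel per
consumer so that each of them wakes up and exits.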
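
fetchHTML retries any non-200 response up to five times by hand. A sketch of
roughly equivalent behavior using requests' built-in retry support; note it
retries only the listed transient status codes rather than every non-200,
which is a deliberate narrowing of the original logic:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

# usage (URL taken from the script; the site no longer serves the list):
# r = session.get('http://alexa.chinaz.com/Global/index.html', timeout=10)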