super updates with python

Moshel
2017-06-24 00:06:50 +08:00
parent 17fe952165
commit f0342113e0
22 changed files with 5633 additions and 0 deletions

factory/top500.py (Normal file, 127 lines)

@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import threading
import time
import sys
import requests
urls = ['http://alexa.chinaz.com/Global/index.html']
for i in range(2, 21):
    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
urls_scan_over = False
domains = []
# thread that scans the ranking pages in `urls` and collects domain names into `domains`
class UrlScanner(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global urls_scan_over, urls
        done_num = 0
        while len(urls):
            html = self.fetchHTML(urls.pop(0))
            self.parseHTML(html)
            done_num = done_num + 25
            print('top500 fetched: %d/500' % done_num)
            time.sleep(1)
        urls_scan_over = True
        print('top500 site list fetched.')

    def fetchHTML(self, url):
        # retry up to 5 times; on persistent failure, sys.exit only ends this thread
        success = False
        try_times = 0
        while try_times < 5 and not success:
            r = requests.get(url)
            if r.status_code != 200:
                time.sleep(1)
                try_times = try_times + 1
            else:
                success = True
                break
        if not success:
            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code))
        r.encoding = 'utf-8'
        return r.text

    def parseHTML(self, html):
        soup = BeautifulSoup(html, "lxml")
        namesDom = soup.select("div.righttxt h3 span")
        for name in namesDom:
            domains.append(name.string)
requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
}
# thread that probes each collected domain and classifies it as direct or proxy
class DomainScanner(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while not urls_scan_over or len(domains):
            if len(domains) == 0:
                time.sleep(2)
                continue
            domain = domains.pop(0)
            if domain.endswith('.cn'):
                continue
            if 'google' in domain:
                continue
            is_proxy = False
            try:
                requests.get('http://' + domain, timeout=10, headers=requests_header)
            except BaseException:
                try:
                    requests.get('http://www.' + domain, timeout=10, headers=requests_header)
                except BaseException:
                    is_proxy = True
            if is_proxy:
                file_proxy.write(domain + '\n')
            else:
                file_direct.write(domain + '\n')
            print('[domains remaining: %d]\tproxy: %s\t%s' % (len(domains), is_proxy, domain))
        print('refreshing top500 lists in 5s...')
        time.sleep(5)
# output files
file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')
now_time = '# list refresh time: ' + time.strftime("%Y-%m-%d %H:%M:%S")
file_proxy.write(now_time + '\n')
file_direct.write(now_time + '\n')

# start threads: one page scanner, five domain checkers
UrlScanner().start()
for i in range(5):
    DomainScanner().start()
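
Side note on the threading pattern: the two scanner classes hand work off through plain module-level lists, and the check-then-pop in DomainScanner.run can race when several worker threads see the same last item. Below is a minimal sketch of the same producer/consumer handoff built on queue.Queue; it is not part of this commit, and the names (produce_domains, check_domain, work_queue) are illustrative only.

# Minimal producer/consumer sketch with queue.Queue (illustrative, not from this commit).
import queue
import threading

work_queue = queue.Queue()   # thread-safe replacement for the shared `domains` list
SENTINEL = None              # tells a consumer thread that no more work is coming
NUM_WORKERS = 2

def produce_domains():
    # stand-in for UrlScanner.parseHTML appending parsed names
    for domain in ['example.com', 'example.org', 'example.net']:
        work_queue.put(domain)
    for _ in range(NUM_WORKERS):
        work_queue.put(SENTINEL)   # one sentinel per consumer so each worker exits

def check_domain():
    # stand-in for DomainScanner.run: get() blocks, so no len() check or sleep loop is needed
    while True:
        domain = work_queue.get()
        if domain is SENTINEL:
            break
        print('would probe', domain)

workers = [threading.Thread(target=check_domain) for _ in range(NUM_WORKERS)]
for t in workers:
    t.start()
produce_domains()
for t in workers:
    t.join()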