From e6764fe6281cd5131e15cd0370c43d25b9094ed0 Mon Sep 17 00:00:00 2001
From: Johnshall
Date: Thu, 9 Dec 2021 20:11:01 +0800
Subject: [PATCH] Remove top50.py and go back to top500.py. Just a memo.

---
 factory/top50.py  | 113 -------------------------------
 factory/top500.py | 168 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 113 deletions(-)
 delete mode 100644 factory/top50.py
 create mode 100644 factory/top500.py

diff --git a/factory/top50.py b/factory/top50.py
deleted file mode 100644
index 904edec..0000000
--- a/factory/top50.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from bs4 import BeautifulSoup
-import threading
-import time
-import sys
-import requests
-import re
-
-'''
-# Update note:
-
-# The original source http://alexa.chinaz.com/Global/index.html now only
-# provides the top-50 list and is hard to scrape, so we scrape the worldwide
-# top-50 list from www.similarweb.com instead.
-# The worldwide top-500 list is no longer available for free, so the fairly
-# representative top-50 list is used in its place.
-
-urls = ['http://alexa.chinaz.com/Global/index.html']
-for i in range(2, 21):
-    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
-'''
-
-
-urls = 'https://www.similarweb.com/zh/top-websites/united-states/?utm_source=addon&utm_medium=chrome&utm_content=overview&utm_campaign=country-rank'
-
-urls_scan_over = False
-
-domains = []
-
-domains_proxy = []
-domains_direct = []
-
-requests_header = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
-    'Cache-Control': 'max-age=0',
-    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
-    'Connection': 'keep-alive'
-}
-
-
-def getTop(url):
-    # fetch the SimilarWeb ranking page and collect the site names
-    r = requests.get(url=url, headers=requests_header)
-    soup = BeautifulSoup(r.text, "lxml")
-    namesDom = soup.select("span.topRankingGrid-titleName")
-
-    for name in namesDom:
-        domains.append(name.string)
-
-    print('{:-^30}'.format('We got it!'))
-    print('{:-^30}'.format('Top50 Fetching over'))
-    print('\n')
-    print('\n\n')
-
-
-# Start
-print('{:-^30}'.format('Top50 Script Starting'))
-print('\n')
-getTop(urls)
-
-# visit each website to decide direct vs. proxy
-while len(domains):
-    domain = domains.pop(0)
-    is_proxy = False
-
-    try:
-        requests.get('http://www.' + domain, timeout=10, headers=requests_header)
-    except BaseException:
-        try:
-            requests.get('http://' + domain, timeout=10, headers=requests_header)
-        except BaseException:
-            is_proxy = True
-
-    if is_proxy:
-        domains_proxy.append(domain)
-        print(domain + " is proxy\n")
-    else:
-        domains_direct.append(domain)
-        print(domain + " is direct\n")
-
-    print('[Domains Remain: %d]\tProxy %s:%s\n' % (len(domains), is_proxy, domain))
-
-
-# Add the Apple domains to the direct list.
-# This script is expected to run in an environment where GitHub may be
-# unreachable, so the CDN mirror provided by staticdn.net is used instead.
-# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
-print('{:-^30}'.format('Adding Apple domains to direct'))
-print('\n\n')
-r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
-for url in r.text.split("\n")[:-1]:
-    url = re.sub(r'^server=/', '', url)             # strip the "server=/" prefix
-    url = re.sub(r'/114\.114\.114\.114$', '', url)  # strip the "/114.114.114.114" suffix
-    domains_direct.append(url)
-
-# write the result files
-file_proxy = open('resultant/top50_proxy.list', 'w', encoding='utf-8')
-file_direct = open('resultant/top50_direct.list', 'w', encoding='utf-8')
-
-now_time = time.strftime("%Y-%m-%d %H:%M:%S")
-file_proxy.write('# top50 proxy list update time: ' + now_time + '\n')
-file_direct.write('# top50 direct list update time: ' + now_time + '\n')
-
-domains_direct = list(set(domains_direct))
-domains_proxy = list(set(domains_proxy))
-domains_direct.sort()
-domains_proxy.sort()
-
-for domain in domains_direct:
-    file_direct.write(domain + '\n')
-for domain in domains_proxy:
-    file_proxy.write(domain + '\n')
-
-print('{:-^30}'.format('Done!'))
diff --git a/factory/top500.py b/factory/top500.py
new file mode 100644
index 0000000..b0538c3
--- /dev/null
+++ b/factory/top500.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+
+'''
+# Warning:
+# chinaz.com no longer provides the global top-500 list for free, so this
+# script currently does not work. It is kept only as a backup and will be
+# re-released once a reliable top-500 source is available.
+'''
+
+from bs4 import BeautifulSoup
+import threading
+import time
+import sys
+import requests
+import re
+
+urls = ['http://alexa.chinaz.com/Global/index.html']
+for i in range(2, 21):
+    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
+
+urls_scan_over = False
+
+domains = []
+
+domains_proxy = []
+domains_direct = []
+
+
+# thread that scans the ranking pages listed in urls
+class UrlScaner(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+
+    def run(self):
+        global urls_scan_over, urls
+
+        done_num = 0
+
+        while len(urls):
+            html = self.fetchHTML(urls.pop(0))
+            self.parseHTML(html)
+
+            done_num = done_num + 25  # each page lists 25 sites
+            print('top500 List Got: %d/500' % done_num)
+
+            time.sleep(1)
+
+        urls_scan_over = True
+        print('top500 List Fetched Over.')
+
+    def fetchHTML(self, url):
+        # fetch a page, retrying up to 5 times on non-200 responses
+        success = False
+        try_times = 0
+        r = None
+        while try_times < 5 and not success:
+            r = requests.get(url)
+            if r.status_code != 200:
+                time.sleep(1)
+                try_times = try_times + 1
+            else:
+                success = True
+                break
+
+        if not success:
+            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code))
+
+        r.encoding = 'utf-8'
+        return r.text
+
+    def parseHTML(self, html):
+        soup = BeautifulSoup(html, "lxml")
+        namesDom = soup.select("div.righttxt h3 span")
+
+        for name in namesDom:
+            domains.append(name.string)
+
+
+requests_header = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
+    'Cache-Control': 'max-age=0',
+    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
+    'Connection': 'keep-alive'
+}
+
+
+# thread that visits each scraped website to decide direct vs. proxy
+class DomainScaner(threading.Thread):
+    def __init__(self):
+        threading.Thread.__init__(self)
+
+    def run(self):
+        # keep working while pages are still being scanned or domains remain
+        while not urls_scan_over or len(domains):
+            if len(domains) == 0:
+                time.sleep(2)
+                continue
+
+            domain = domains.pop(0)
+
+            if domain.endswith('.cn'):
+                continue
+            if 'google' in domain:
+                continue
+
+            is_proxy = False
+
+            try:
+                requests.get('http://www.' + domain, timeout=10, headers=requests_header)
+            except BaseException:
+                try:
+                    requests.get('http://' + domain, timeout=10, headers=requests_header)
+                except BaseException:
+                    is_proxy = True
+
+            if is_proxy:
+                domains_proxy.append(domain)
+            else:
+                domains_direct.append(domain)
+
+            print('[Domains Remain: %d]\tProxy %s:%s' % (len(domains), is_proxy, domain))
+
+        global scaner_thread_num
+        scaner_thread_num -= 1
+
+
+print('top500 Script Starting...\n\n')
+
+# start the scanner threads
+UrlScaner().start()
+scaner_thread_num = 0
+for i in range(3):
+    DomainScaner().start()
+    scaner_thread_num += 1
+
+# wait until every DomainScaner thread has finished
+while scaner_thread_num:
+    time.sleep(1)
+
+# Add the Apple domains to the direct list.
+# This script is expected to run in an environment where GitHub may be
+# unreachable, so the CDN mirror provided by staticdn.net is used instead.
+# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
+print('{:-^30}'.format('Adding Apple domains to direct'))
+print('\n\n')
+r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
+for url in r.text.split("\n")[:-1]:
+    url = re.sub(r'^server=/', '', url)             # strip the "server=/" prefix
+    url = re.sub(r'/114\.114\.114\.114$', '', url)  # strip the "/114.114.114.114" suffix
+    domains_direct.append(url)
+
+# write the result files
+file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
+file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')
+
+now_time = time.strftime("%Y-%m-%d %H:%M:%S")
+file_proxy.write('# top500 proxy list update time: ' + now_time + '\n')
+file_direct.write('# top500 direct list update time: ' + now_time + '\n')
+
+domains_direct = list(set(domains_direct))
+domains_proxy = list(set(domains_proxy))
+domains_direct.sort()
+domains_proxy.sort()
+
+for domain in domains_direct:
+    file_direct.write(domain + '\n')
+for domain in domains_proxy:
+    file_proxy.write(domain + '\n')
+
+file_proxy.close()
+file_direct.close()
+
+print('{:-^30}'.format('Done!'))
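
For reference, the apple.china.conf lines consumed by both scripts are dnsmasq
"server=" entries, one domain per line, which the two re.sub calls reduce to a
bare domain. A minimal standalone sketch of the same extraction; the sample
line is illustrative and assumed to match the file's format:

import re

line = 'server=/apple.com/114.114.114.114'  # assumed sample entry
m = re.match(r'^server=/(?P<domain>[^/]+)/', line)
if m:
    print(m.group('domain'))  # prints: apple.com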
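
top500.py hands work from the producer (UrlScaner) to the consumers
(DomainScaner) through a shared list, an urls_scan_over flag, and a thread
counter that the main thread polls. Below is a minimal sketch of the same
hand-off using the standard library's queue.Queue and Thread.join() instead;
this is an alternative pattern with stand-in functions, not what the patch
implements:

import threading
import queue

domains_q = queue.Queue()

def producer():
    for d in ('example.com', 'example.org'):  # stand-in for the page scraper
        domains_q.put(d)
    domains_q.put(None)  # sentinel: tells the consumer there is no more work

def consumer():
    while True:
        d = domains_q.get()
        if d is None:
            break
        print('visiting', d)  # stand-in for the reachability probe

t_prod = threading.Thread(target=producer)
t_cons = threading.Thread(target=consumer)
t_prod.start()
t_cons.start()
t_prod.join()  # blocks without spinning, unlike polling scaner_thread_num
t_cons.join()

With several consumer threads, the producer would put one sentinel per
consumer so that each of them wakes up and exits.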
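
fetchHTML retries any non-200 response up to five times by hand. A sketch of
roughly equivalent behavior using requests' built-in retry support; note it
retries only the listed transient status codes rather than every non-200,
which is a deliberate narrowing of the original logic:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

# usage (URL taken from the script; the site no longer serves the list):
# r = session.get('http://alexa.chinaz.com/Global/index.html', timeout=10)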