# -*- coding: utf-8 -*-
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
'''
# Update notes:
# The original source, http://alexa.chinaz.com/Global/index.html, only exposes
# the top-50 ranking and is hard to crawl, so we now scrape the world top-50
# from www.similarweb.com instead.
# The full top-500 list is not available for free, so the more representative
# top-50 ranking is used.
urls = ['http://alexa.chinaz.com/Global/index.html']
for i in range(2, 21):
    urls.append('http://alexa.chinaz.com/Global/index_%d.html' % i)
'''
urls = 'https://www.similarweb.com/zh/top-websites/united-states/?utm_source=addon&utm_medium=chrome&utm_content=overview&utm_campaign=country-rank'
domains = []
domains_proxy = []
domains_direct = []
requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
}
def getTop(urls):
    # Fetch the ranking page and pull the Top-50 domain names out of it.
    # The CSS selector matches similarweb's markup at the time of writing.
    r = requests.get(url=urls, headers=requests_header)
    soup = BeautifulSoup(r.text, "lxml")
    namesDom = soup.select("span.topRankingGrid-titleName")
    for name in namesDom:
        domains.append(name.string)
    print('{:-^30}'.format('We get!'))
    print('{:-^30}'.format('Top50 Fetching over'))
    print('\n')
print('\n\n')
# Start
print('{:-^30}'.format('Top50 Script Starting'))
print('\n')
getTop(urls)
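# Sanity check (an added guard, not in the original script): similarweb may
# serve this ranking via client-side rendering, in which case the selector
# above matches nothing; abort rather than write empty lists further down.
if not domains:
    sys.exit('No domains scraped -- the similarweb markup may have changed.')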
# Visit each website sequentially to decide whether it needs a proxy:
# reachable over plain HTTP within 10s -> direct, otherwise -> proxy.
while len(domains):
    domain = domains.pop(0)
    is_proxy = False
    try:
        requests.get('http://www.' + domain, timeout=10, headers=requests_header)
    except BaseException:
        try:
            requests.get('http://' + domain, timeout=10, headers=requests_header)
        except BaseException:
            is_proxy = True
    if is_proxy:
        domains_proxy.append(domain)
        print(domain + " is proxy\n")
    else:
        domains_direct.append(domain)
        print(domain + " is direct\n")
    print('[Domains Remain: %d]\tProxy %s: %s\n' % (len(domains), is_proxy, domain))
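# Hedged sketch (not part of the original flow): the probing above is
# sequential, which dominates the runtime. A pool-based variant could look
# like the function below; the names `probe_all` and `workers` are
# illustrative assumptions, and the function is defined but never called.
def probe_all(pending, workers=10):
    from concurrent.futures import ThreadPoolExecutor

    def probe(domain):
        # Mirror the sequential logic: try www. first, then the bare host.
        for prefix in ('http://www.', 'http://'):
            try:
                requests.get(prefix + domain, timeout=10, headers=requests_header)
                return domain, False  # reachable -> direct
            except BaseException:
                continue
        return domain, True  # unreachable -> proxy

    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(probe, pending))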
# Add Apple's domains to the direct list.
# This script is expected to run in an environment that may not be able to
# reach GitHub, so the CDN mirror provided by staticdn.net is used instead of:
# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
print('{:-^30}'.format('Adding Apple domains to direct list'))
print('\n\n')
r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
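# Each line of apple.china.conf is a dnsmasq rule of the form
#   server=/apple.com/114.114.114.114
# so stripping the prefix and suffix below leaves only the bare domain.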
for url in r.text.split("\n")[:-1]:
    url = re.sub(r'^server=/', '', url)  # strip the dnsmasq prefix
    url = re.sub(r'/114\.114\.114\.114$', '', url)  # strip the resolver suffix
    domains_direct.append(url)
# Write the result files: de-duplicate, sort, then dump one domain per line.
domains_direct = list(set(domains_direct))
domains_proxy = list(set(domains_proxy))
domains_direct.sort()
domains_proxy.sort()
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
with open('resultant/top50_proxy.list', 'w', encoding='utf-8') as file_proxy:
    file_proxy.write('# top50 proxy list update time: ' + now_time + '\n')
    for domain in domains_proxy:
        file_proxy.write(domain + '\n')
with open('resultant/top50_direct.list', 'w', encoding='utf-8') as file_direct:
    file_direct.write('# top50 direct list update time: ' + now_time + '\n')
    for domain in domains_direct:
        file_direct.write(domain + '\n')
print('{:-^30}'.format('Done!'))
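# For reference, each output file is a timestamp comment followed by a
# newline-separated domain list, e.g.:
#   # top50 direct list update time: YYYY-MM-DD HH:MM:SS
#   apple.com
#   ...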