mirror of https://github.com/Johnshall/Shadowrocket-ADBlock-Rules-Forever.git
synced 2025-12-18 15:54:50 +08:00
Remove top50.py and go back to top500.py. Just a memo.
This commit is contained in:
factory/top50.py (113 lines deleted)
@@ -1,113 +0,0 @@
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import threading
import time
import sys
import requests
import re

'''
# Update notes:

# Since the original source http://alexa.chinaz.com/Global/index.html only provides a top-50 list and is hard to crawl,
# the script now crawls the world top-50 list from www.similarweb.com instead.
# Because the world top-500 list is not available for free, the fairly representative top-50 list is used instead.

urls = ['http://alexa.chinaz.com/Global/index.html']
for i in range(2,21):
    urls.append('http://alexa.chinaz.com/Global/index_%d.html'%i)
'''


urls = 'https://www.similarweb.com/zh/top-websites/united-states/?utm_source=addon&utm_medium=chrome&utm_content=overview&utm_campaign=country-rank'

urls_scan_over = False

domains = []

domains_proxy = []
domains_direct = []

requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
}


def getTop(urls):
    global requests_header
    r = requests.get(url = 'https://www.similarweb.com/zh/top-websites/united-states/?utm_source=addon&utm_medium=chrome&utm_content=overview&utm_campaign=country-rank',
                     headers=requests_header)
    soup = BeautifulSoup(r.text, "lxml")
    namesDom = soup.select("span.topRankingGrid-titleName")
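    # (Note: "topRankingGrid-titleName" is simply the class name the similarweb page used
    #  at the time; the selector will need updating if that markup changes.)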

    for name in namesDom:
        domains.append(name.string)

    print('{:-^30}'.format('We get!'))
    print('{:-^30}'.format('Top50 Fetching over'))
    print('\n')
    print('\n\n')


# Start
print('{:-^30}'.format('Top50 Script Starting'))
print('\n')
getTop(urls)

# visit each website to decide whether it needs a proxy or can connect directly
while len(domains):
    domain = domains.pop(0)
    is_proxy = False

    try:
        requests.get('http://www.' + domain, timeout=10, headers=requests_header)
    except BaseException:
        try:
            requests.get('http://' + domain, timeout=10, headers=requests_header)
        except BaseException:
            is_proxy = True

    if is_proxy:
        domains_proxy.append(domain)
        print(domain + " is proxy\n")
    else:
        domains_direct.append(domain)
        print(domain + " is direct\n")

    print('[Domains Remain: %d]\tProxy %s:%s\n' % (len(domains), is_proxy, domain) )


# Add Apple domains to the direct list.
# Since this script is expected to run inside an environment where GitHub may be unreachable,
# the CDN mirror provided by staticdn.net is used instead.
# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
print('{:-^30}'.format('Adding Apple domains to the direct list'))
print('\n\n')
r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
for url in r.text.split("\n")[:-1]:
    url = re.sub(r'(server=\/)', '', url)  # strip the prefix
    url = re.sub(r'(/114.114.114.114)', '', url)  # strip the suffix
    domains_direct.append(url)
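# (Each line of apple.china.conf is a dnsmasq rule such as "server=/apple.com/114.114.114.114";
#  the two substitutions above reduce it to the bare domain, e.g. "apple.com".)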

# write files
file_proxy = open('resultant/top50_proxy.list', 'w', encoding='utf-8')
file_direct = open('resultant/top50_direct.list', 'w', encoding='utf-8')

now_time = time.strftime("%Y-%m-%d %H:%M:%S")
file_proxy.write('# top50 proxy list update time: ' + now_time + '\n')
file_direct.write('# top50 direct list update time: ' + now_time + '\n')

domains_direct = list( set(domains_direct) )
domains_proxy = list( set(domains_proxy) )
domains_direct.sort()
domains_proxy.sort()

for domain in domains_direct:
    file_direct.write(domain+'\n')
for domain in domains_proxy:
    file_proxy.write(domain+'\n')

print('{:-^30}'.format('Done!'))
factory/top500.py (168 lines added, new file)
@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-

'''
# Warning:
Since chinaz.com (站长之家) no longer provides the global top-500 list for free, this script no longer works.
It is kept here only as a backup; it will be re-released once a reliable top-500 list source is available.
'''

from bs4 import BeautifulSoup
import threading
import time
import sys
import requests
import re

urls = ['http://alexa.chinaz.com/Global/index.html']
for i in range(2,21):
    urls.append('http://alexa.chinaz.com/Global/index_%d.html'%i)

urls_scan_over = False

domains = []

domains_proxy = []
domains_direct = []


# thread to scan pages in urls
class UrlScaner(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global urls_scan_over, urls

        done_num = 0

        while len(urls):
            html = self.fetchHTML( urls.pop(0) )
            self.praseHTML(html)

            done_num = done_num + 25   # each chinaz page lists 25 sites (20 pages -> 500)
            print('top500 List Got: %d/500'%done_num)

            time.sleep(1)

        urls_scan_over = True
        print('top500 List Fetched Over.')

    def fetchHTML(self, url):
        success = False
        try_times = 0
        r = None
        while try_times < 5 and not success:
            r = requests.get(url)
            if r.status_code != 200:
                time.sleep(1)
                try_times = try_times + 1
            else:
                success = True
                break

        if not success:
            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code) )

        r.encoding = 'utf-8'
        return r.text

    def praseHTML(self, html):
        soup = BeautifulSoup(html, "lxml")
        namesDom = soup.select("div.righttxt h3 span")

        for name in namesDom:
            domains.append(name.string)


requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
}


# thread to visit websites
class DomainScaner(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while not urls_scan_over or len(domains):
            if len(domains) == 0:
                time.sleep(2)
                continue

            domain = domains.pop(0)

            if domain.endswith('.cn'):
                continue
            if 'google' in domain:
                continue

            is_proxy = False

            try:
                requests.get('http://www.' + domain, timeout=10, headers=requests_header)
            except BaseException:
                try:
                    requests.get('http://' + domain, timeout=10, headers=requests_header)
                except BaseException:
                    is_proxy = True

            if is_proxy:
                domains_proxy.append(domain)
            else:
                domains_direct.append(domain)

            print('[Domains Remain: %d]\tProxy %s:%s' % (len(domains), is_proxy, domain) )

        global scaner_thread_num
        scaner_thread_num -= 1


print('top500 Script Starting...\n\n')

# Start Thread
UrlScaner().start()
scaner_thread_num = 0
for i in range(3):
    DomainScaner().start()
    scaner_thread_num += 1

# wait thread done
while scaner_thread_num:
    pass
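# (scaner_thread_num drops to 0 once every DomainScaner has finished its run(),
#  so this busy-wait ends after all domains have been classified.)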

# Add Apple domains to the direct list.
# Since this script is expected to run inside an environment where GitHub may be unreachable,
# the CDN mirror provided by staticdn.net is used instead.
# r = requests.get(url="https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/apple.china.conf", headers=requests_header)
print('{:-^30}'.format('Adding Apple domains to the direct list'))
print('\n\n')
r = requests.get(url='https://raw.staticdn.net/felixonmars/dnsmasq-china-list/master/apple.china.conf', headers=requests_header)
for url in r.text.split("\n")[:-1]:
    url = re.sub(r'(server=\/)', '', url)  # strip the prefix
    url = re.sub(r'(/114.114.114.114)', '', url)  # strip the suffix
    domains_direct.append(url)

# write files
file_proxy = open('resultant/top50_proxy.list', 'w', encoding='utf-8')
file_direct = open('resultant/top50_direct.list', 'w', encoding='utf-8')

now_time = time.strftime("%Y-%m-%d %H:%M:%S")
file_proxy.write('# top50 proxy list update time: ' + now_time + '\n')
file_direct.write('# top50 direct list update time: ' + now_time + '\n')

domains_direct = list( set(domains_direct) )
domains_proxy = list( set(domains_proxy) )
domains_direct.sort()
domains_proxy.sort()

for domain in domains_direct:
    file_direct.write(domain+'\n')
for domain in domains_proxy:
    file_proxy.write(domain+'\n')

print('{:-^30}'.format('Done!'))