Files
Johnshall-Shadowrocket-ADBl…/factory/top500_manual.py
2022-05-23 12:16:58 +08:00

107 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
'''
此脚本用于对 top500_manual.list 中网站进行评估,判断需要直连或代理
该脚本应当在内网环境中运行
'''
import pandas as pd
import requests
import time
import threading
import csv
url = 'https://moz.com/top-500/download/?table=top500Domains'
r = requests.get(url)
with open("top500Domains.csv", "wb") as code:
code.write(r.content)
with open('top500Domains_new.csv','r') as csvfile:
reader = csv.reader(csvfile)
with open("resultant/top500_manual.list", "w") as file_domain_in:
for domain_i,rows in enumerate(reader):
if domain_i != 1:
for domain_n in reader:
file_domain_in.write(domain_n[1] + '\n')
domains = []
with open("resultant/top500_manual.list", "r", encoding='utf-8') as f:
for domain in f.readlines():
if domain[0] == "#":
continue
domains.append(domain[:-1])
domains_proxy = []
domains_direct = []
requests_header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'Cache-Control': 'max-age=0',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
'Connection': 'keep-alive'
}
# thread to judge direct/proxy
class DomainScaner(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
while len(domains):
domain = domains.pop(0)
is_proxy = False
try:
requests.get('http://www.' + domain, timeout=10, headers=requests_header)
except BaseException:
try:
requests.get('http://' + domain, timeout=10, headers=requests_header)
except BaseException:
is_proxy = True
if is_proxy:
domains_proxy.append(domain)
else:
domains_direct.append(domain)
print('[Doamins Remain: %d]\tProxy %s%s' % (len(domains), is_proxy, domain) )
global scaner_thread_num
scaner_thread_num -= 1
print('top500 Script Starting...\n\n')
# Start Thread
scaner_thread_num = 0
for i in range(5):
DomainScaner().start()
scaner_thread_num += 1
# wait thread done
while scaner_thread_num:
pass
# write files
file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
file_proxy.write('# top500 proxy list update time: ' + now_time + '\n')
file_direct.write('# top500 direct list update time: ' + now_time + '\n')
domains_direct = list( set(domains_direct) )
domains_proxy = list( set(domains_proxy) )
domains_direct.sort()
domains_proxy.sort()
for domain in domains_direct:
file_direct.write(domain+'\n')
for domain in domains_proxy:
file_proxy.write(domain+'\n')
print('{:-^30}'.format('Done!'))