mirror of
https://github.com/Johnshall/Shadowrocket-ADBlock-Rules-Forever.git
synced 2025-12-18 07:44:57 +08:00
super updates with python
This commit is contained in:
56
factory/README.md
Normal file
56
factory/README.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# 规则文件开发说明
|
||||
|
||||
这里是规则文件的生成车间,欢迎访问。
|
||||
|
||||
|
||||
## 规则模板
|
||||
|
||||
`template/` 目录下为规则模板,`build_confs.py` 脚本运行时会按照模板生成规则文件。
|
||||
|
||||
每个规则对应一个模板,不过 `sr_head.txt` 和 `sr_foot.txt` 是例外,这两个文件是所有模板的公共的头部和尾部。
|
||||
|
||||
|
||||
## 手工配置的文件
|
||||
|
||||
**manual_direct.txt**
|
||||
|
||||
列表,手动编写。记录走直连的域名或 IP。
|
||||
|
||||
**manual_proxy.txt**
|
||||
|
||||
列表,手动编写。记录走代理的域名或 IP。
|
||||
|
||||
**manual_reject.txt**
|
||||
|
||||
列表,手动编写。记录需要屏蔽的域名或 IP。
|
||||
|
||||
|
||||
## 代码及自动生成的文件
|
||||
|
||||
**resultant/top500_direct.list**
|
||||
|
||||
域名列表,由 `top500.py` 自动生成。记录着前 500 网站中所有可直连网站的域名,并已排除了以 `.cn` 结尾的域名。
|
||||
|
||||
**resultant/top500_proxy.list**
|
||||
|
||||
域名列表,由 `top500.py` 自动生成。记录着前 500 网站中无法直连网站的域名。
|
||||
|
||||
其中未包括含有 `google` 关键字的域名,并且首页请求时间大于 10 秒也视为无法直连。
|
||||
|
||||
**top500.py**
|
||||
|
||||
脚本,运行所需时间较长。自动爬取生成 `top500_*.list` 文件。
|
||||
|
||||
|
||||
**resultant/ad.list**

域名及 IP 列表,由 `ad.py` 自动生成。包括所有需要屏蔽的广告域名和广告服务器的 IP。

**ad.py**

脚本,从指定的 Adblock Rule 中提取广告服务器的域名和 IP 至 `resultant/ad.list` 文件。
|
||||
|
||||
59
factory/ad.py
Normal file
59
factory/ad.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import time
|
||||
import sys
|
||||
import requests
|
||||
import re
|
||||
|
||||
|
||||
# Adblock rule sources to pull block entries from.
rules_url = [
    'https://easylist-downloads.adblockplus.org/easylistchina.txt',  # EasyList China
    'https://github.com/cjx82630/cjxlist/raw/master/cjxlist.txt'  # EasyList Lite
]

# contain both domains and ips
domains = []


for rule_url in rules_url:
    print('loading... ' + rule_url)

    # Fetch the rule text, retrying up to 5 times on a non-200 response;
    # abort the whole run if every attempt fails.
    success = False
    try_times = 0
    while try_times < 5 and not success:
        r = requests.get(rule_url)
        if r.status_code != 200:
            time.sleep(1)
            try_times = try_times + 1
        else:
            success = True

    if not success:
        sys.exit('error in request %s\n\treturn code: %d' % (rule_url, r.status_code) )

    rule = r.text

    # Adblock syntax marks blocked hosts as '||host^'; capture the host part
    # (matches both domain names and bare IPv4 addresses).
    for ret in re.findall(r'\|\|([\w\.]+)\^?\n', rule):
        domains.append(ret)

    print('done.')


# Write the entries sorted and de-duplicated; the context manager
# guarantees the file is flushed and closed even on error.
with open('resultant/ad.list', 'w', encoding='utf-8') as file_ad:
    file_ad.write('# ad rules refresh time: ' + time.strftime("%Y-%m-%d %H:%M:%S") + '\n')
    for item in sorted(set(domains)):
        file_ad.write(item + '\n')
|
||||
76
factory/build_confs.py
Normal file
76
factory/build_confs.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import time
|
||||
|
||||
|
||||
# Names of the rule templates (in template/) and of the generated .conf
# files (in ../).  sr_head.txt and sr_foot.txt are shared by every
# template and therefore not listed here.
confs_names = [
    'sr_top500_banlist',
    'sr_top500_banlist_ad',
    'sr_top500_whitelist',
    'sr_top500_whitelist_ad'
]
|
||||
|
||||
|
||||
def getRulesStringFromFile(path, kind):
    """Convert a plain list file into Shadowrocket rule lines.

    Each non-empty line of *path* becomes either
    ``DOMAIN-SUFFIX,<entry>,<kind>`` or, when the entry starts with an
    IPv4 dotted quad, ``IP-CIDR,<entry>,<kind>`` (a bare address gets a
    ``/32`` suffix appended).  Lines starting with ``#`` are copied
    through unchanged as comments.

    :param path: path of the UTF-8 input list file
    :param kind: policy name appended to every rule, e.g. ``Proxy``
    :return: the generated rules as one newline-terminated string
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(path, 'r', encoding='utf-8') as file:
        contents = file.readlines()

    ret = ''

    for content in contents:
        content = content.strip('\r\n')
        if not len(content):
            continue

        if content.startswith('#'):
            ret += content + '\n'
        else:
            prefix = 'DOMAIN-SUFFIX'
            # re.match anchors at the start only, so '10.0.0.0/8' also matches.
            if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', content):
                prefix = 'IP-CIDR'
                if '/' not in content:
                    content += '/32'

            ret += prefix + ',%s,%s\n' % (content, kind)

    return ret
|
||||
|
||||
|
||||
# Head and foot shared by every generated .conf file.
with open('template/sr_head.txt', 'r', encoding='utf-8') as file_head:
    str_head = file_head.read()
with open('template/sr_foot.txt', 'r', encoding='utf-8') as file_foot:
    str_foot = file_foot.read()


# Values substituted for the {{mark}} placeholders found in the templates.
values = {}

values['build_time'] = time.strftime("%Y-%m-%d %H:%M:%S")

values['top500_proxy'] = getRulesStringFromFile('resultant/top500_proxy.list', 'Proxy')
values['top500_direct'] = getRulesStringFromFile('resultant/top500_direct.list', 'Direct')

values['ad'] = getRulesStringFromFile('resultant/ad.list', 'Reject')

values['manual_direct'] = getRulesStringFromFile('manual_direct.txt', 'Direct')
values['manual_proxy'] = getRulesStringFromFile('manual_proxy.txt', 'Proxy')
values['manual_reject'] = getRulesStringFromFile('manual_reject.txt', 'Reject')


# Render each template into ../<name>.conf.
for conf_name in confs_names:
    with open('template/' + conf_name + '.txt', 'r', encoding='utf-8') as file_template:
        template = file_template.read()

    template = str_head + template + str_foot

    # Non-greedy match so two placeholders on one line cannot merge
    # into a single bogus mark.
    marks = re.findall(r'{{(.+?)}}', template)

    for mark in marks:
        template = template.replace('{{' + mark + '}}', values[mark])

    # Close (and flush) the output explicitly instead of relying on
    # interpreter shutdown.
    with open('../' + conf_name + '.conf', 'w', encoding='utf-8') as file_output:
        file_output.write(template)
|
||||
0
factory/manual_direct.txt
Normal file
0
factory/manual_direct.txt
Normal file
12
factory/manual_proxy.txt
Normal file
12
factory/manual_proxy.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
#Telegram
|
||||
67.198.55.0/24
|
||||
91.108.4.0/22
|
||||
91.108.8.0/22
|
||||
91.108.12.0/22
|
||||
91.108.16.0/22
|
||||
91.108.56.0/22
|
||||
109.239.140.0/24
|
||||
149.154.160.0/20
|
||||
149.154.164.0/22
|
||||
149.154.168.0/22
|
||||
149.154.172.0/22
|
||||
0
factory/manual_reject.txt
Normal file
0
factory/manual_reject.txt
Normal file
1240
factory/resultant/ad.list
Normal file
1240
factory/resultant/ad.list
Normal file
File diff suppressed because it is too large
Load Diff
353
factory/resultant/top500_direct.list
Normal file
353
factory/resultant/top500_direct.list
Normal file
@@ -0,0 +1,353 @@
|
||||
baidu.com
|
||||
yahoo.com
|
||||
qq.com
|
||||
wikipedia.org
|
||||
amazon.com
|
||||
taobao.com
|
||||
msn.com
|
||||
weibo.com
|
||||
bing.com
|
||||
linkedin.com
|
||||
live.com
|
||||
hao123.com
|
||||
yahoo.co.jp
|
||||
vk.com
|
||||
yandex.ru
|
||||
yahoo.co.jp
|
||||
ebay.com
|
||||
onclickads.net
|
||||
soso.com
|
||||
bing.com
|
||||
ebay.com
|
||||
yandex.ru
|
||||
stackoverflow.com
|
||||
wordpress.com
|
||||
msn.com
|
||||
aliexpress.com
|
||||
apple.com
|
||||
microsoft.com
|
||||
netflix.com
|
||||
imgur.com
|
||||
amazon.in
|
||||
github.com
|
||||
imdb.com
|
||||
mail.ru
|
||||
popads.net
|
||||
paypal.com
|
||||
amazon.co.jp
|
||||
cnzz.com
|
||||
office.com
|
||||
whatsapp.com
|
||||
youku.com
|
||||
adobe.com
|
||||
microsoftonline.com
|
||||
xinhuanet.com
|
||||
cnn.com
|
||||
xhamster.com
|
||||
twitch.tv
|
||||
alibaba.com
|
||||
adf.ly
|
||||
coccoc.com
|
||||
amazon.de
|
||||
quora.com
|
||||
360.com
|
||||
wikia.com
|
||||
so.com
|
||||
bbc.com
|
||||
haosou.com
|
||||
diply.com
|
||||
amazon.co.uk
|
||||
rakuten.co.jp
|
||||
godaddy.com
|
||||
amazonaws.com
|
||||
nicovideo.jp
|
||||
espn.com
|
||||
bbc.co.uk
|
||||
salesforce.com
|
||||
booking.com
|
||||
terraclicks.com
|
||||
zhihu.com
|
||||
soundcloud.com
|
||||
craigslist.org
|
||||
ebay.co.uk
|
||||
ebay.de
|
||||
uol.com.br
|
||||
alipay.com
|
||||
daum.net
|
||||
stackexchange.com
|
||||
savefrom.net
|
||||
tudou.com
|
||||
dailymail.co.uk
|
||||
thewhizmarketing.com
|
||||
cctv.com
|
||||
wordpress.org
|
||||
deviantart.com
|
||||
livejasmin.com
|
||||
indeed.com
|
||||
uptodown.com
|
||||
w3schools.com
|
||||
ettoday.net
|
||||
avito.ru
|
||||
blastingnews.com
|
||||
huffingtonpost.com
|
||||
trello.com
|
||||
theguardian.com
|
||||
steampowered.com
|
||||
chase.com
|
||||
cnet.com
|
||||
outbrain.com
|
||||
buzzfeed.com
|
||||
9gag.com
|
||||
mediafire.com
|
||||
snapdeal.com
|
||||
popcash.net
|
||||
force.com
|
||||
slack.com
|
||||
bilibili.com
|
||||
github.io
|
||||
etsy.com
|
||||
tribunnews.com
|
||||
vice.com
|
||||
onlinesbi.com
|
||||
sogou.com
|
||||
china.com
|
||||
ameblo.jp
|
||||
washingtonpost.com
|
||||
adexchangeprediction.com
|
||||
wikihow.com
|
||||
babytree.com
|
||||
spotify.com
|
||||
bankofamerica.com
|
||||
detail.tmall.com
|
||||
ozock.com
|
||||
livejournal.com
|
||||
mercadolivre.com.br
|
||||
forbes.com
|
||||
csdn.net
|
||||
tistory.com
|
||||
weather.com
|
||||
huanqiu.com
|
||||
naver.jp
|
||||
kinogo.club
|
||||
detik.com
|
||||
goo.ne.jp
|
||||
mozilla.org
|
||||
skype.com
|
||||
youm7.com
|
||||
walmart.com
|
||||
foxnews.com
|
||||
isanalyze.com
|
||||
amazon.it
|
||||
amazon.fr
|
||||
wellsfargo.com
|
||||
openload.co
|
||||
gfycat.com
|
||||
wikimedia.org
|
||||
wetransfer.com
|
||||
reimageplus.com
|
||||
myway.com
|
||||
wordreference.com
|
||||
giphy.com
|
||||
feedly.com
|
||||
nih.gov
|
||||
rutracker.org
|
||||
espncricinfo.com
|
||||
weebly.com
|
||||
51.la
|
||||
yelp.com
|
||||
iwanttodeliver.com
|
||||
theladbible.com
|
||||
ikea.com
|
||||
freepik.com
|
||||
tripadvisor.com
|
||||
businessinsider.com
|
||||
instructure.com
|
||||
163.com
|
||||
aol.com
|
||||
sourceforge.net
|
||||
taboola.com
|
||||
doublepimp.com
|
||||
allegro.pl
|
||||
zippyshare.com
|
||||
varzesh3.com
|
||||
softonic.com
|
||||
rt.com
|
||||
mailchimp.com
|
||||
zillow.com
|
||||
hdfcbank.com
|
||||
upwork.com
|
||||
kinopoisk.ru
|
||||
onclickpredictiv.com
|
||||
gearbest.com
|
||||
dmm.co.jp
|
||||
zendesk.com
|
||||
themeforest.net
|
||||
fiverr.com
|
||||
daikynguyenvn.com
|
||||
addthis.com
|
||||
douyu.com
|
||||
webtretho.com
|
||||
douban.com
|
||||
spotscenered.info
|
||||
123movies.to
|
||||
thesaurus.com
|
||||
rarbg.to
|
||||
speedtest.net
|
||||
xywy.com
|
||||
sh.st
|
||||
ndtv.com
|
||||
tokopedia.com
|
||||
rambler.ru
|
||||
gsmarena.com
|
||||
nametests.com
|
||||
leboncoin.fr
|
||||
ebay-kleinanzeigen.de
|
||||
wittyfeed.com
|
||||
icicibank.com
|
||||
asos.com
|
||||
amazon.es
|
||||
clicksgear.com
|
||||
wix.com
|
||||
secureserver.net
|
||||
iqiyi.com
|
||||
onedio.com
|
||||
directrev.com
|
||||
youtube-mp3.org
|
||||
kakaku.com
|
||||
paytm.com
|
||||
irctc.co.in
|
||||
goodreads.com
|
||||
nfl.com
|
||||
battle.net
|
||||
behance.net
|
||||
hatenablog.com
|
||||
roblox.com
|
||||
39.net
|
||||
hp.com
|
||||
icloud.com
|
||||
oracle.com
|
||||
hatena.ne.jp
|
||||
evernote.com
|
||||
thewhizproducts.com
|
||||
go.com
|
||||
repubblica.it
|
||||
1688.com
|
||||
zoho.com
|
||||
2ch.net
|
||||
shopify.com
|
||||
yesky.com
|
||||
jd.com
|
||||
samsung.com
|
||||
bloomberg.com
|
||||
seasonvar.ru
|
||||
canva.com
|
||||
1905.com
|
||||
cnblogs.com
|
||||
blkget.com
|
||||
adnetworkperformance.com
|
||||
kompas.com
|
||||
sharepoint.com
|
||||
mediawhirl.net
|
||||
kissanime.to
|
||||
americanexpress.com
|
||||
quizlet.com
|
||||
telegraph.co.uk
|
||||
hotstar.com
|
||||
steamcommunity.com
|
||||
kaskus.co.id
|
||||
liputan6.com
|
||||
gmx.net
|
||||
xfinity.com
|
||||
51yes.com
|
||||
onet.pl
|
||||
pandora.com
|
||||
yts.ag
|
||||
abs-cbn.com
|
||||
bukalapak.com
|
||||
atlassian.net
|
||||
dictionary.com
|
||||
sberbank.ru
|
||||
web.de
|
||||
sabah.com.tr
|
||||
pixabay.com
|
||||
ruten.com.tw
|
||||
putlocker.is
|
||||
udemy.com
|
||||
ebay.in
|
||||
fivethirtyeight.com
|
||||
target.com
|
||||
blackboard.com
|
||||
digikala.com
|
||||
infusionsoft.com
|
||||
sciencedirect.com
|
||||
accuweather.com
|
||||
mi.com
|
||||
blog.jp
|
||||
conservativetribune.com
|
||||
weblio.jp
|
||||
wixsite.com
|
||||
techcrunch.com
|
||||
neobux.com
|
||||
kickstarter.com
|
||||
airbnb.com
|
||||
avg.com
|
||||
dell.com
|
||||
ups.com
|
||||
capitalone.com
|
||||
tutorialspoint.com
|
||||
ebay.it
|
||||
usatoday.com
|
||||
homedepot.com
|
||||
ign.com
|
||||
usps.com
|
||||
leagueoflegends.com
|
||||
box.com
|
||||
kooora.com
|
||||
researchgate.net
|
||||
about.com
|
||||
spiegel.de
|
||||
poptm.com
|
||||
baike.com
|
||||
taleo.net
|
||||
ouo.io
|
||||
marca.com
|
||||
subscene.com
|
||||
hubspot.com
|
||||
php.net
|
||||
qiita.com
|
||||
thefreedictionary.com
|
||||
fastpokemap.se
|
||||
oeeee.com
|
||||
list-manage.com
|
||||
4chan.org
|
||||
orange.fr
|
||||
kapanlagi.com
|
||||
fedex.com
|
||||
uploaded.net
|
||||
azlyrics.com
|
||||
teepr.com
|
||||
webmd.com
|
||||
groupon.com
|
||||
wp.pl
|
||||
intuit.com
|
||||
mashable.com
|
||||
free.fr
|
||||
mercadolibre.com.ar
|
||||
eastday.com
|
||||
cricbuzz.com
|
||||
gizmodo.com
|
||||
billdesk.com
|
||||
ebay.com.au
|
||||
ci123.com
|
||||
dcinside.com
|
||||
hespress.com
|
||||
exoclick.com
|
||||
hola.com
|
||||
indianexpress.com
|
||||
youdao.com
|
||||
vk.me
|
||||
amazon.ca
|
||||
namu.wiki
|
||||
bhaskar.com
|
||||
theverge.com
|
||||
state.gov
|
||||
77
factory/resultant/top500_proxy.list
Normal file
77
factory/resultant/top500_proxy.list
Normal file
@@ -0,0 +1,77 @@
|
||||
youtube.com
|
||||
facebook.com
|
||||
twitter.com
|
||||
instagram.com
|
||||
blogspot.com
|
||||
tumblr.com
|
||||
pinterest.com
|
||||
blogger.com
|
||||
naver.com
|
||||
pornhub.com
|
||||
t.co
|
||||
fc2.com
|
||||
xvideos.com
|
||||
ok.ru
|
||||
dropbox.com
|
||||
flipkart.com
|
||||
pixnet.net
|
||||
bongacams.com
|
||||
nytimes.com
|
||||
vimeo.com
|
||||
ask.com
|
||||
txxx.com
|
||||
bet365.com
|
||||
dailymotion.com
|
||||
indiatimes.com
|
||||
thepiratebay.org
|
||||
slideshare.net
|
||||
bp.blogspot.com
|
||||
livedoor.jp
|
||||
hclips.com
|
||||
globo.com
|
||||
twimg.com
|
||||
chaturbate.com
|
||||
blogspot.in
|
||||
shutterstock.com
|
||||
fbcdn.net
|
||||
redtube.com
|
||||
upornia.com
|
||||
xnxx.com
|
||||
pinimg.com
|
||||
cloudfront.net
|
||||
extratorrent.cc
|
||||
archive.org
|
||||
files.wordpress.com
|
||||
doubleclick.net
|
||||
youporn.com
|
||||
flickr.com
|
||||
blogspot.com.br
|
||||
globaloffers.link
|
||||
scribd.com
|
||||
medium.com
|
||||
media.tumblr.com
|
||||
mega.nz
|
||||
bitauto.com
|
||||
messenger.com
|
||||
eksisozluk.com
|
||||
ltn.com.tw
|
||||
4dsply.com
|
||||
hootsuite.com
|
||||
tradeadexchange.com
|
||||
reddituploads.com
|
||||
wsj.com
|
||||
elpais.com
|
||||
blogspot.jp
|
||||
bestbuy.com
|
||||
telegram.org
|
||||
reuters.com
|
||||
nyaa.se
|
||||
ytimg.com
|
||||
zone-telechargement.com
|
||||
tube8.com
|
||||
beeg.com
|
||||
spankbang.com
|
||||
disqus.com
|
||||
4shared.com
|
||||
goo.gl
|
||||
hurriyet.com.tr
|
||||
2
factory/template/sr_foot.txt
Normal file
2
factory/template/sr_foot.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
# By Moshel @ https://hzy.pw/
|
||||
10
factory/template/sr_head.txt
Normal file
10
factory/template/sr_head.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
# Best Shadowrocket Rules (https://github.com/h2y/Shadowrocket-ADBlock-Rules)
|
||||
# by Moshel
|
||||
# build time: {{build_time}}
|
||||
|
||||
[General]
|
||||
bypass-system = true
|
||||
skip-proxy = 192.168.0.0/16,10.0.0.0/8,172.16.0.0/12,localhost,*.local,e.crashlytics.com,captive.apple.com
|
||||
bypass-tun = 10.0.0.0/8,100.64.0.0/10,127.0.0.0/8,169.254.0.0/16,172.16.0.0/12,192.0.0.0/24,192.0.2.0/24,192.88.99.0/24,192.168.0.0/16,198.18.0.0/15,198.51.100.0/24,203.0.113.0/24,224.0.0.0/4,255.255.255.255/32
|
||||
dns-server =
|
||||
[Rule]
|
||||
21
factory/template/sr_top500_banlist.txt
Normal file
21
factory/template/sr_top500_banlist.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
#
|
||||
# 黑名单模式,对不确定的网站尽可能地直连
|
||||
#
|
||||
# 代理:top500 网站中不可直连的网站
|
||||
# 直连:中国网站、国外的其余网站
|
||||
# 不包含广告过滤
|
||||
#
|
||||
|
||||
|
||||
# top500 proxy
|
||||
{{top500_proxy}}
|
||||
|
||||
|
||||
# manual proxy
|
||||
{{manual_proxy}}
|
||||
|
||||
|
||||
DOMAIN-KEYWORD,google,Proxy
|
||||
|
||||
FINAL,direct
|
||||
29
factory/template/sr_top500_banlist_ad.txt
Normal file
29
factory/template/sr_top500_banlist_ad.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
|
||||
#
|
||||
# 黑名单模式,对不确定的网站尽可能地直连
|
||||
#
|
||||
# 代理:top500 网站中不可直连的网站
|
||||
# 直连:中国网站、国外的其余网站
|
||||
# 包含广告过滤
|
||||
#
|
||||
|
||||
|
||||
# top500 proxy
|
||||
{{top500_proxy}}
|
||||
|
||||
|
||||
# manual proxy
|
||||
{{manual_proxy}}
|
||||
|
||||
|
||||
# ad block
|
||||
{{ad}}
|
||||
|
||||
|
||||
# manual block
|
||||
{{manual_reject}}
|
||||
|
||||
|
||||
DOMAIN-KEYWORD,google,Proxy
|
||||
|
||||
FINAL,direct
|
||||
22
factory/template/sr_top500_whitelist.txt
Normal file
22
factory/template/sr_top500_whitelist.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
#
|
||||
# 白名单模式,对不确定的网站尽可能地走代理
|
||||
#
|
||||
# 直连:top500 网站中可直连的网站、中国网站
|
||||
# 代理:其余的所有国外网站
|
||||
# 不包含广告过滤
|
||||
#
|
||||
|
||||
|
||||
# top500 direct
|
||||
{{top500_direct}}
|
||||
|
||||
|
||||
# manual direct
|
||||
{{manual_direct}}
|
||||
|
||||
|
||||
DOMAIN-SUFFIX,cn,DIRECT
|
||||
|
||||
GEOIP,CN,DIRECT
|
||||
FINAL,proxy
|
||||
30
factory/template/sr_top500_whitelist_ad.txt
Normal file
30
factory/template/sr_top500_whitelist_ad.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
#
|
||||
# 白名单模式,对不确定的网站尽可能地走代理
|
||||
#
|
||||
# 直连:top500 网站中可直连的网站、中国网站
|
||||
# 代理:其余的所有国外网站
|
||||
# 包含广告过滤
|
||||
#
|
||||
|
||||
|
||||
# top500 direct
|
||||
{{top500_direct}}
|
||||
|
||||
|
||||
# manual direct
|
||||
{{manual_direct}}
|
||||
|
||||
|
||||
# ad block
|
||||
{{ad}}
|
||||
|
||||
|
||||
# manual block
|
||||
{{manual_reject}}
|
||||
|
||||
|
||||
DOMAIN-SUFFIX,cn,DIRECT
|
||||
|
||||
GEOIP,CN,DIRECT
|
||||
FINAL,proxy
|
||||
127
factory/top500.py
Normal file
127
factory/top500.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import threading
|
||||
import time
|
||||
import sys
|
||||
import requests
|
||||
|
||||
|
||||
# Alexa top-500 ranking pages on alexa.chinaz.com: the first page plus
# pages index_2 .. index_20 (25 sites per page).
urls = ['http://alexa.chinaz.com/Global/index.html'] + [
    'http://alexa.chinaz.com/Global/index_%d.html' % page
    for page in range(2, 21)
]

# Flag raised by the page-scanner thread once every page has been parsed.
urls_scan_over = False

# Shared work queue of site names still waiting to be probed.
domains = []
|
||||
|
||||
|
||||
# thread to scan pages in urls
class UrlScaner(threading.Thread):
    """Worker thread that downloads every ranking page in the shared
    module-level ``urls`` queue and appends the site names found on
    each page to the shared ``domains`` list.

    NOTE(review): class name looks like a typo of "UrlScanner"; kept
    as-is because instantiation sites elsewhere use this spelling.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Consume the shared ``urls`` queue until it is empty, then
        # signal completion to the prober threads via ``urls_scan_over``.
        global urls_scan_over, urls

        done_num = 0

        while len(urls):
            html = self.fetchHTML( urls.pop(0) )
            self.praseHTML(html)

            # each ranking page lists 25 sites
            done_num = done_num + 25
            print('top500 已获取:%d/500'%done_num)

            # brief pause between pages to go easy on the server
            time.sleep(1)

        urls_scan_over = True
        print('top500 网站获取完毕。')

    def fetchHTML(self, url):
        # Fetch *url*, retrying up to 5 times on a non-200 status;
        # aborts the whole program via sys.exit if every attempt fails.
        success = False
        try_times = 0
        while try_times < 5 and not success:
            r = requests.get(url)
            if r.status_code != 200:
                time.sleep(1)
                try_times = try_times + 1
            else:
                success = True
                break

        if not success:
            sys.exit('error in request %s\n\treturn code: %d' % (url, r.status_code) )

        # force UTF-8 decoding; the page's declared charset is unreliable
        r.encoding = 'utf-8'
        return r.text

    def praseHTML(self, html):
        # Extract the site names from one ranking page and append them
        # to the shared ``domains`` queue.
        # NOTE(review): method name is a typo of "parseHTML"; kept as-is.
        soup = BeautifulSoup(html, "lxml")
        # site names live in <div class="righttxt"><h3><span>name</span>...
        namesDom = soup.select("div.righttxt h3 span")

        for name in namesDom:
            domains.append(name.string)
|
||||
|
||||
|
||||
# Browser-like request headers used when probing each site, so servers
# respond as they would to a real desktop Chrome visitor rather than to
# the default python-requests User-Agent.
requests_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-HK;q=0.6,zh-TW;q=0.4,en;q=0.2',
    'Connection': 'keep-alive'
}
|
||||
|
||||
|
||||
# thread to visit websites
class DomainScaner(threading.Thread):
    """Worker thread that pops site names off the shared ``domains``
    queue, tries to reach each one directly, and records it in the
    module-level ``file_proxy`` or ``file_direct`` list file.

    ``.cn`` domains are skipped (always direct), and ``google`` domains
    are skipped (covered by a DOMAIN-KEYWORD rule in the templates).
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Keep working while the page scanner may still add entries or
        # while the queue is non-empty.
        while not urls_scan_over or len(domains):
            if len(domains) == 0:
                # queue temporarily empty; wait for the scanner thread
                time.sleep(2)
                continue

            domain = domains.pop(0)

            if domain.endswith('.cn'):
                continue
            if 'google' in domain:
                continue

            is_proxy = False

            # A site counts as directly reachable when either
            # http://domain or http://www.domain answers within 10s.
            # Catch only requests' own errors (timeouts, DNS, bad URLs)
            # rather than BaseException, so Ctrl-C / SystemExit still
            # stop the thread instead of mislabeling the site.
            try:
                requests.get('http://' + domain, timeout=10, headers=requests_header)
            except requests.RequestException:
                try:
                    requests.get('http://www.' + domain, timeout=10, headers=requests_header)
                except requests.RequestException:
                    is_proxy = True

            if is_proxy:
                file_proxy.write(domain + '\n')
            else:
                file_direct.write(domain + '\n')

            print('[剩余域名数量:%d]\tProxy %s:%s' % (len(domains), is_proxy, domain) )
|
||||
|
||||
|
||||
print('5s later to start refresh top500 lists...')
time.sleep(5)

# Output files, shared with the DomainScaner threads; closed below once
# every worker has finished.
file_proxy = open('resultant/top500_proxy.list', 'w', encoding='utf-8')
file_direct = open('resultant/top500_direct.list', 'w', encoding='utf-8')

now_time = '# list refresh time: ' + time.strftime("%Y-%m-%d %H:%M:%S")
file_proxy.write(now_time + '\n')
file_direct.write(now_time + '\n')

# Start one page-scanner thread and five domain-prober threads, then
# join them all so the list files can be flushed and closed
# deterministically instead of at interpreter shutdown.
workers = [UrlScaner()]
workers.extend(DomainScaner() for _ in range(5))
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()

file_proxy.close()
file_direct.close()
|
||||
Reference in New Issue
Block a user