This came about because a server-side scraper I wrote depends on a data source that is only reachable from mainland China, while the server itself is overseas, so requests from abroad have to go through a proxy to reach the mainland source. That's why I wrote this small standalone script to collect usable proxy addresses from the web.
Since it's just for my own use, the code is a bit rough; please go easy on me, and pointers are welcome.
proxies.py:
import json
import time
from threading import Thread
import bs4
import requests
from constant import REQUESTS_HEADERS
class GetProxiesThread(Thread):
    def __init__(self):
        Thread.__init__(self)
        # Latest list of proxies that passed verification, refreshed by run()
        self.proxies_list = []
    def kxdaili1(self):
        ip_list = []
        ip_ports = []
        ip_types = []
        # Scrape the first 10 pages of the free proxy list
        for page in range(1, 11):
            res = requests.get("http://www.kxdaili.com/dailiip/2/" + str(page) + ".html")
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            ip_list += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(1)')
            ip_ports += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(2)')
            ip_types += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(4)')
        true_ip_list = []
        # zip keeps the three columns aligned even if one selector matched fewer rows
        for ip_tag, port_tag, type_tag in zip(ip_list, ip_ports, ip_types):
            ip = ip_tag.text
            port = port_tag.text
            ip_type = type_tag.text
            if ip_type == "HTTP,HTTPS":
                proxies = {
                    'http': 'http://' + ip + ':' + port,
                    'https': 'http://' + ip + ':' + port
                }
                try:
                    # Verify the proxy by fetching a known page through it;
                    # requests.get never returns None, so check the status code instead
                    res = requests.get('https://www.wenku8.net/novel/2/2152/index.htm', headers=REQUESTS_HEADERS,
                                       proxies=proxies, timeout=3)
                    if res.status_code == 200:
                        true_ip_list.append(proxies)
                except requests.RequestException:
                    pass
        print("proxies update:", true_ip_list)
        return true_ip_list
    def kxdaili2(self, list_len):
        # Fallback source: only scraped when the first source yielded fewer than 3 proxies
        if list_len >= 3:
            return []
        ip_list = []
        ip_ports = []
        ip_types = []
        # Scrape the first 10 pages of the second free proxy list
        for page in range(1, 11):
            res = requests.get("http://www.kxdaili.com/dailiip/1/" + str(page) + ".html")
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            ip_list += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(1)')
            ip_ports += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(2)')
            ip_types += soup.select(
                'body > div.banner-box > div.header-container > div.domain-block.price-block > div.auto > div.hot-product > div.hot-product-content > table > tbody > tr > td:nth-child(4)')
        true_ip_list = []
        # Same alignment and verification logic as kxdaili1
        for ip_tag, port_tag, type_tag in zip(ip_list, ip_ports, ip_types):
            ip = ip_tag.text
            port = port_tag.text
            ip_type = type_tag.text
            if ip_type == "HTTP,HTTPS":
                proxies = {
                    'http': 'http://' + ip + ':' + port,
                    'https': 'http://' + ip + ':' + port
                }
                try:
                    res = requests.get('https://www.wenku8.net/novel/2/2152/index.htm', headers=REQUESTS_HEADERS,
                                       proxies=proxies, timeout=3)
                    if res.status_code == 200:
                        true_ip_list.append(proxies)
                except requests.RequestException:
                    pass
        print("proxies update:", true_ip_list)
        return true_ip_list
    def run(self):
        while True:
            # Collect proxies from both sources, then persist them for other scripts to read
            self.proxies_list = self.kxdaili1()
            self.proxies_list += self.kxdaili2(len(self.proxies_list))
            if self.proxies_list:
                with open("proxies.json", "w") as file:
                    file.write(json.dumps(self.proxies_list, indent=2))
            time.sleep(3)
Usage
Start this thread from whichever script needs it, then read the proxies.json file in the directory whenever you need a proxy.
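One thing to keep in mind: run() loops forever, so a plain start() will keep the process alive after your main code finishes. If that matters for your script, you can mark the thread as a daemon before starting it. A minimal sketch using the standard threading API:

from proxies import GetProxiesThread

updater = GetProxiesThread()
updater.daemon = True  # daemon threads are killed when the main program exits
updater.start()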
Example
main.py:
# It doesn't matter how many threads you start; I start three here so the proxy list updates a bit faster
import json
import random
import requests
from proxies import GetProxiesThread

get_proxies = [GetProxiesThread(), GetProxiesThread(), GetProxiesThread()]
for thread in get_proxies:
    thread.start()
# Read the file (assumes the thread has already written at least one proxy)
with open("proxies.json", "r") as file:
    proxies = json.loads(file.read())
# Pick a random proxy; random.choice avoids the off-by-one of randint(0, len(proxies))
proxies = random.choice(proxies)
# Send the request
res = requests.get("http://www.baidu.com/", proxies=proxies)
print(res.text)
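For reference, the file the thread writes is just a JSON list of requests-style proxy dicts, so proxies.json ends up looking like this (addresses are illustrative):

[
  {
    "http": "http://1.2.3.4:8080",
    "https": "http://1.2.3.4:8080"
  }
]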
Note: remember to create a proxies.json file in the directory beforehand, otherwise the first read may throw an error; a minimal way to do that is shown below.
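For example, you can seed it with an empty list (a sketch; an empty list still means main.py has nothing to pick until the thread writes its first update):

import json

# Create proxies.json with an empty list so the first json.loads doesn't fail
with open("proxies.json", "w") as file:
    json.dump([], file)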