爬取快代理（kuaidaili.com）的免费代理，构建一个 IP 代理池
import requests
import parsel
import time
import random
# Desktop browser User-Agent strings that user_agent() rotates through.
_USER_AGENTS = (
    # Legacy Edge on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    # Chromium-based Edge on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54',
    # Chrome on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    # Safari on macOS
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
    # Firefox on Windows
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    # Opera on macOS
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.9.168 Version/11.52',
)


def user_agent():
    """Return request headers carrying a randomly chosen User-Agent.

    The first pool entry previously began with a duplicated
    ``Mozilla/5.0 (Mozilla/5.0 (`` prefix; it now has the regular form.

    :return: a new ``{'User-Agent': <string>}`` dict on every call, so the
        caller may mutate it freely
    """
    return {'User-Agent': random.choice(_USER_AGENTS)}
def check_ip(proxies_list, timeout=0.1):
    """Keep only the proxies that answer a probe request with HTTP 200.

    Each entry is a requests-style proxies mapping such as
    ``{'http': '1.2.3.4:8080'}``. requests picks a proxy by matching the
    lower-case scheme of the requested URL against the mapping's keys, so the
    probe lower-cases the keys and targets a URL whose scheme the proxy
    covers. Probing an ``https://`` URL with an http-only entry would bypass
    the proxy and test the direct connection instead.

    :param proxies_list: iterable of proxies mappings to test
    :param timeout: seconds to wait for each probe; the default keeps the
        original, very strict 0.1 s
    :return: list of the entries from ``proxies_list`` (same objects, same
        order) whose probe succeeded
    """
    usable = []
    for proxy in proxies_list:
        probe_proxies = {
            str(scheme).lower(): address for scheme, address in proxy.items()
        }
        # Stay on the original https probe unless the entry only covers http.
        if 'http' in probe_proxies and 'https' not in probe_proxies:
            probe_scheme = 'http'
        else:
            probe_scheme = 'https'
        try:
            reply = requests.get(
                f'{probe_scheme}://www.baidu.com',
                headers=user_agent(),
                proxies=probe_proxies,
                timeout=timeout,
            )
        except requests.RequestException:
            # Unreachable, slow or misbehaving proxy: drop it and move on.
            continue
        if reply.status_code == 200:
            usable.append(proxy)
    return usable
def rand_proxy(proxy_list):
    """Pick one proxy at random from a list of vetted proxies.

    :param proxy_list: non-empty sequence of proxy entries
    :return: one element of ``proxy_list``
    :raises ValueError: if ``proxy_list`` is empty (same exception type as
        before, now with a message that names the actual problem)
    """
    if not proxy_list:
        raise ValueError('proxy_list is empty: no usable proxy to choose from')
    return random.choice(proxy_list)
def response(pages=1, timeout=10):
    """Scrape free proxies from kuaidaili.com into requests-style mappings.

    Every table row becomes ``{scheme: 'ip:port'}``. The scheme is
    lower-cased because requests matches proxy keys against the lower-case
    URL scheme; a key such as ``'HTTP'`` would never be selected. Rows with a
    missing scheme, ip or port cell are skipped instead of raising.

    :param pages: number of listing pages to fetch, starting at page 1; the
        default reproduces the original single-page crawl
    :param timeout: seconds to wait for each listing page, so a stalled
        connection cannot block forever
    :return: list of single-entry ``{scheme: 'ip:port'}`` dicts
    """
    proxies_list = []
    for page in range(1, pages + 1):
        url = f'https://www.kuaidaili.com/free/inha/{page}/'
        page_reply = requests.get(url, headers=user_agent(), timeout=timeout)
        selector = parsel.Selector(page_reply.text)
        rows = selector.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for row in rows:
            http_type = row.xpath('./td[4]/text()').extract_first()
            ip = row.xpath('./td[1]/text()').extract_first()
            port = row.xpath('./td[2]/text()').extract_first()
            if not (http_type and ip and port):
                # Incomplete row: nothing usable to build a proxy from.
                continue
            proxies_list.append(
                {http_type.strip().lower(): f'{ip.strip()}:{port.strip()}'}
            )
        # Pause between listing pages to avoid hammering the site.
        time.sleep(0.5)
    return proxies_list
# print(proxies_list,len(proxies_list))
# proxies_use = check_ip(proxies_list)
# print(proxies_use,len(proxies_use))
# print(user_agent())
# print(check_ip(response()))
# print(rand_proxy(check_ip(response())))
def proxy():
    """Scrape, vet and return one randomly chosen proxy.

    :return: a single proxy entry picked by rand_proxy from the proxies
        that passed check_ip
    """
    scraped = response()
    vetted = check_ip(scraped)
    return rand_proxy(vetted)
# print(proxy())
食用方法
将上面的代码保存为 proxy_pools.py，并放在与调用它的脚本相同的目录下，然后这样调用：
import proxy_pools
print(proxy_pools.proxy())