Python Web Scraping - Anti-Scraping Techniques (Building an IP Proxy Pool)

March 13, 2020

An IP proxy pool built by scraping the free proxy list at Kuaidaili (kuaidaili.com).

import requests
import parsel
import time
import random

def user_agent():
    '''
    Pick a random User-Agent.
    :return: a headers dict containing one User-Agent
    '''

    user_agent_pools = [
        # Legacy Edge (EdgeHTML) on Windows 10
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'},
        # Chromium-based Edge on Windows 10
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54'},
        # Chrome on Windows 10
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'},
        # Safari on macOS
        {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'},
        # Firefox on Windows
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0'},
        # Opera on macOS
        {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.9.168 Version/11.52'}
    ]
    # random.choice does the index bookkeeping for us
    return random.choice(user_agent_pools)

def check_ip(proxies_list):
    '''
    Check proxy quality by sending a test request through each proxy.
    :param proxies_list: a list of proxy dicts
    :return: the subset of proxies that responded successfully
    '''

    high_quality_ips = []
    for proxy in proxies_list:
        try:
            # The aggressive 0.1s timeout keeps only the fastest proxies;
            # raise it (e.g. to 1-2 seconds) if too few survive
            response = requests.get('https://www.baidu.com', headers=user_agent(), proxies=proxy, timeout=0.1)
            if response.status_code == 200:
                high_quality_ips.append(proxy)
        except Exception:
            continue
    return high_quality_ips

def rand_proxy(proxy_list):
    '''
    Pick one proxy at random from the high-quality list.
    :param proxy_list: a list of verified proxy dicts
    :return: a single proxy dict
    '''
    return random.choice(proxy_list)

def fetch_proxies():
    '''
    Scrape the Kuaidaili free proxy list.
    :return: a list of proxy dicts
    '''
    proxies_list = []
    # Crawl the free proxy pages; widen the range to fetch more pages
    for page in range(1, 2):
        url = f'https://www.kuaidaili.com/free/inha/{page}/'
        response = requests.get(url, headers=user_agent())
        data = response.text

        html_data = parsel.Selector(data)
        parse_list = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')

        # Each proxy takes the form {'protocol': 'ip:port'}
        for tr in parse_list:
            proxies_dict = {}
            http_type = tr.xpath('./td[4]/text()').extract_first()
            ip = tr.xpath('./td[1]/text()').extract_first()
            port = tr.xpath('./td[2]/text()').extract_first()

            # requests expects lowercase protocol keys ('http'/'https')
            proxies_dict[http_type.lower()] = ip + ":" + port
            proxies_list.append(proxies_dict)
        # Pause between pages, not between rows, to stay polite
        time.sleep(0.5)
    return proxies_list


def proxy():
    '''
    Wrap the whole pipeline: fetch, verify, and pick one proxy.
    :return: a single proxy dict, e.g. {'http': '1.2.3.4:8080'}
    '''
    return rand_proxy(check_ip(fetch_proxies()))

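For a quick smoke test, a standard __main__ guard can be appended so the file prints one working proxy when run directly (a minimal sketch):

if __name__ == '__main__':
    # Fetch the free list, verify the proxies, print one at random
    print(proxy())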

Usage

  1. Save the code above as proxy_pools.py in the same directory as the script that will call it.

  2. Import it and ask for a proxy (a fuller example follows below):

    import proxy_pools
    print(proxy_pools.proxy())
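Step 2 only proves the import works. Below is a minimal sketch of routing an actual request through the pool; the target URL http://httpbin.org/ip and the 5-second timeout are placeholder choices, not part of the original script:

import requests
import proxy_pools

# Ask the pool for one verified proxy, e.g. {'http': '1.2.3.4:8080'}
proxy = proxy_pools.proxy()
# Route the request through it; swap in the site you actually want to scrape
response = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
print(response.text)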
    
