如何不受阻碍地爬取受限网页的标题？（含反爬及地区限制）

阿华AIGC实验室

2026-5-9

解决403封禁，稳定爬取目标网页标题的方案

我看了你的代码和遇到的问题，403错误主要是因为目标网站的反爬机制识别出了爬虫请求，再加上你用的免费公开代理质量参差不齐，才会出现偶尔成功但大部分时候失败的情况。下面我拆解问题并给出可行的改进方案：

1. 代理质量与有效性是核心问题

你现在从 sslproxies.org 获取的代理大多是公开免费的，其中很多已经失效或者被目标网站拉黑；而且代码没有事先验证代理能否正常访问目标网站就直接使用，成功率自然很低。另外，目标网站仅允许特定国家/地区访问，因此还需要筛选对应地区的代理（比如网站只允许美国 IP，就优先选 US 地区的代理）。

2. 单薄的请求头容易被识别为爬虫

你的请求头只有User-Agent，这和真实浏览器的请求差异太大，很容易被反爬机制拦截。需要补充更多浏览器会发送的请求头，模拟真实访问行为。

3. 重试逻辑需要优化

当前的重试逻辑没有控制请求频率，频繁更换代理发起请求容易触发网站的反爬阈值；而且当代理列表耗尽时，代码会陷入死循环，缺少兜底（fallback）机制。

改进后的完整代码

import random
import time
import requests
from bs4 import BeautifulSoup

# Page whose title is scraped.
link = 'https://www.veteranownedbusiness.com/business/25150/bm-wendling-real-estate'
# Browser-like request headers shared by every request in this script, so the
# traffic looks less like a bare script than a lone User-Agent would.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    # "br" is deliberately not advertised: requests/urllib3 can only decode
    # Brotli when the optional brotli/brotlicffi package is installed. Without
    # it, a Brotli-compressed response would reach the HTML parser undecoded.
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://www.veteranownedbusiness.com/",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def get_proxy_list(target_country="US"):
    """Fetch HTTPS-capable proxies for one country from sslproxies.org.

    Args:
        target_country: Value to match against the third column of the proxy
            table (e.g. "US"). The comparison ignores case and surrounding
            whitespace.

    Returns:
        A list of "ip:port" strings. The list is empty when the page cannot
        be fetched or parsed, or when no row matches.
    """
    wanted_country = target_country.strip().upper()
    # The shared headers carry a Referer that points at the scrape target;
    # there is no reason to disclose it to the proxy-list site.
    list_headers = {k: v for k, v in headers.items() if k.lower() != "referer"}
    proxies = []
    try:
        r = requests.get('https://www.sslproxies.org/', headers=list_headers, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Skip the header row, then keep rows for the wanted country that are
        # flagged as supporting HTTPS.
        for row in soup.select("table.table tr")[1:]:
            cols = row.select("td")
            # Rows with fewer than 7 cells cannot be indexed up to column 6.
            if len(cols) < 7:
                continue
            ip = cols[0].text.strip()
            port = cols[1].text.strip()
            country = cols[2].text.strip().upper()
            https_support = cols[6].text.strip().lower()
            if country == wanted_country and https_support == "yes":
                proxies.append(f"{ip}:{port}")
    except Exception as e:
        # Best effort: report the failure and return whatever was collected
        # so the caller can decide whether to retry.
        print(f"获取代理列表失败: {str(e)}")
    return proxies

def validate_proxy(proxy, target_url):
    """Check whether target_url can be fetched successfully through proxy.

    Args:
        proxy: Proxy address as "ip:port".
        target_url: URL requested through the proxy.

    Returns:
        True when the request completes with a non-error status (below 400).
        False on any HTTP error status or when the request itself fails.
    """
    proxy_dict = {'https': f'http://{proxy}', 'http': f'http://{proxy}'}
    try:
        r = requests.get(target_url, headers=headers, proxies=proxy_dict, timeout=8)
    except Exception:
        # Any request failure means the proxy is unusable. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # propagate, so Ctrl+C still stops a long validation run.
        return False
    # Accept only non-error responses: a proxy answering 404/407/5xx would be
    # rejected later by raise_for_status() anyway, not just one answering 403.
    return r.ok

def get_valid_proxies(proxy_list, target_url, max_valid=5):
    """Return the first proxies from proxy_list that pass validate_proxy.

    Validation stops as soon as max_valid working proxies have been found, so
    the whole list is not probed when a handful is enough.

    Args:
        proxy_list: Iterable of "ip:port" proxy strings, tried in order.
        target_url: URL each proxy is validated against.
        max_valid: Maximum number of working proxies to collect (default 5).
            A value of 0 or less returns an empty list without any request.

    Returns:
        A list of at most max_valid proxies, in their original order.
    """
    valid_proxies = []
    if max_valid <= 0:
        return valid_proxies
    for proxy in proxy_list:
        if validate_proxy(proxy, target_url):
            valid_proxies.append(proxy)
            # A few working proxies are enough; stop probing the rest.
            if len(valid_proxies) >= max_valid:
                break
    return valid_proxies

def scrape(target_url, max_rounds=None):
    """Fetch target_url through validated proxies and return the page title.

    Each round fetches the proxy list, validates it, and then tries the
    validated proxies in random order. A proxy that fails is dropped from the
    pool; a new round (refetch and revalidate) starts only once the pool is
    empty.

    Args:
        target_url: URL of the page to scrape.
        max_rounds: Maximum number of rounds to attempt. None (the default)
            retries indefinitely.

    Returns:
        The stripped text of the first element matching ".bizname_hdr > h1".

    Raises:
        RuntimeError: If max_rounds is set and every round failed.
    """
    rounds = 0
    while max_rounds is None or rounds < max_rounds:
        rounds += 1
        # Build a fresh pool of validated proxies for this round.
        proxy_list = get_proxy_list()
        if not proxy_list:
            print("未获取到代理列表，等待5秒后重试...")
            time.sleep(5)
            continue
        valid_proxies = get_valid_proxies(proxy_list, target_url)
        if not valid_proxies:
            print("没有可用的代理，等待5秒后重试...")
            time.sleep(5)
            continue

        # Work through the validated pool before refetching, so removing a
        # failed proxy actually prevents it from being picked again.
        while valid_proxies:
            proxy = random.choice(valid_proxies)
            proxy_dict = {'https': f'http://{proxy}', 'http': f'http://{proxy}'}
            print(f"正在使用代理: {proxy}")

            try:
                # Random delay so requests are not fired back to back.
                time.sleep(random.uniform(2, 5))
                r = requests.get(target_url, headers=headers, proxies=proxy_dict, timeout=10)
                r.raise_for_status()
            except requests.exceptions.HTTPError as e:
                # Read the status from the exception's own response rather
                # than from a possibly stale local variable.
                status = getattr(e.response, "status_code", None)
                print(f"请求失败，状态码: {status}，更换代理...")
                valid_proxies.remove(proxy)
                continue
            except Exception as e:
                print(f"请求出错: {str(e)}，更换代理...")
                valid_proxies.remove(proxy)
                continue

            soup = BeautifulSoup(r.text, "html.parser")
            heading = soup.select_one(".bizname_hdr > h1")
            if heading is None:
                # The page loaded but is not the expected one; treat it like
                # any other failed attempt instead of failing on None.
                print("页面中未找到标题元素，更换代理...")
                valid_proxies.remove(proxy)
                continue
            return heading.get_text(strip=True)

    raise RuntimeError(f"经过 {max_rounds} 轮尝试仍未成功爬取: {target_url}")

if __name__ == "__main__":
    # Script entry point: scrape the configured page and print its title.
    # Ctrl+C while scraping ends the run with a short notice, not a traceback.
    try:
        title = scrape(link)
    except KeyboardInterrupt:
        print("程序被用户终止")
    else:
        print(f"爬取到的标题: {title}")