You need to enable JavaScript to run this app.
最新活动
大模型
产品
解决方案
定价
生态与合作
支持与服务
开发者
了解我们

Python图片爬取故障求助:404错误与未知URL协议问题

解决图片爬取时的404、URL Scheme错误及document.write警告问题

先聊聊你遇到的几个问题根源

  • 404错误:要么是部分图片的src路径本身已失效(网站删除了对应图片),要么是URL拼接时出错,还有可能是请求头不够完整被网站反爬拦截
  • net::ERR_UNKNOWN_URL_SCHEME:这是因为你拿到的src值不是标准的HTTP/HTTPS链接——比如是空字符串、data:image开头的base64图片,或是其他非HTTP协议地址,直接拼接域名后就变成了无效URL
  • [Violation] Avoid using document.write():目标网站用document.write()动态生成内容,而requests是静态爬取,只能拿到页面初始HTML,可能没获取到真实的图片链接,导致后续拿到错误的src

修复后的基础版代码(基于原代码优化)

我把你的代码做了精简,添加了异常处理、无效链接过滤,还完善了请求头来模拟真实浏览器:

import requests
from bs4 import BeautifulSoup as bs
import os

from urllib.parse import urljoin, urlparse

# Script: download every logo thumbnail referenced on one goodlogo.com
# "top 250" listing page into a local folder, using plain HTTP requests.

# Build the target directory with os.path.join so the separators are
# correct on every operating system.
save_path = os.path.join(os.path.expanduser("~"), "Desktop", "src_code", "Python_projects", "python", "web_scrap", "myPath")
# Create the directory (and any missing parents) up front.
os.makedirs(save_path, exist_ok=True)

url = 'https://goodlogo.com/top.250/n/250/interval/6'
# Browser-like headers; sent with both the page request and each image request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': 'https://goodlogo.com/'
}

try:
    # Fetch the listing page; raise_for_status turns 4xx/5xx into exceptions
    # so a bad page is reported instead of being parsed.
    sourcecode = requests.get(url, headers=headers, timeout=10)
    sourcecode.raise_for_status()
    soup = bs(sourcecode.text, 'html.parser')

    # One CSS selector covers both thumbnail classes.
    image_tags = soup.select(".top_s3l, .top_s3")
    for tag in image_tags:
        img_src = tag.get('src')
        # Missing or blank src: nothing to download. This check must come
        # first because urljoin(url, "") would return the page URL itself.
        if not img_src or not img_src.strip():
            print(f"跳过无效链接:{img_src}")
            continue

        # Resolve src against the page URL. urljoin handles root-relative
        # ("/a/b.gif"), relative ("b.gif"), protocol-relative ("//cdn/b.gif")
        # and already-absolute links, instead of blindly prefixing the domain.
        full_img_url = urljoin(url, img_src.strip())
        parsed_url = urlparse(full_img_url)
        # Use only the last path component as the file name. A name that
        # still starts with "/" would make os.path.join discard save_path
        # and write outside the target directory.
        # NOTE: images sharing a base name in different directories will
        # overwrite each other.
        img_filename = os.path.basename(parsed_url.path)

        # Drop data:, javascript:, etc. and URLs without a file name.
        if parsed_url.scheme not in ('http', 'https') or not img_filename:
            print(f"跳过无效链接:{img_src}")
            continue

        full_save_path = os.path.join(save_path, img_filename)

        try:
            img_response = requests.get(full_img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with open(full_save_path, "wb") as f:
                f.write(img_response.content)
            print(f"成功保存:{full_save_path}")
        except requests.exceptions.RequestException as e:
            # RequestException derives from OSError, so it must be caught first.
            print(f"下载图片失败 {full_img_url}:{str(e)}")
            continue
        except OSError as e:
            # A failed write (permissions, invalid name, full disk) should
            # skip this image, not abort the remaining downloads.
            print(f"保存图片失败 {full_save_path}:{str(e)}")
            continue

except requests.exceptions.RequestException as e:
    print(f"请求页面失败:{str(e)}")

针对动态内容的进阶方案(解决document.write问题)

如果上面的代码还是拿不到正确的图片链接,说明网站用document.write()动态加载了图片,这时候需要用selenium模拟浏览器渲染页面:

  1. 先安装依赖:pip install selenium
  2. 下载对应浏览器的驱动(比如Chrome的chromedriver),放到Python环境路径或指定路径
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import requests
import os

from urllib.parse import urljoin, urlparse

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Script: render one goodlogo.com "top 250" listing page in headless Chrome,
# then download every logo thumbnail found in the rendered DOM.

save_path = os.path.join(os.path.expanduser("~"), "Desktop", "src_code", "Python_projects", "python", "web_scrap", "myPath")
os.makedirs(save_path, exist_ok=True)

page_url = 'https://goodlogo.com/top.250/n/250/interval/6'
image_selector = ".top_s3l, .top_s3"

# Browser-like headers for the image downloads; the User-Agent is also
# passed to Chrome below so both request paths present the same client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': 'https://goodlogo.com/'
}

# Headless Chrome: no visible window.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument(f"user-agent={headers['User-Agent']}")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(page_url)
    # Explicitly wait (up to 10 s) until at least one thumbnail element is in
    # the DOM. implicitly_wait() only sets the timeout for later find_element
    # calls, so calling it here would not wait for anything.
    try:
        WebDriverWait(driver, 10).until(
            lambda d: d.find_elements(By.CSS_SELECTOR, image_selector)
        )
    except TimeoutException:
        # Best effort: still parse whatever the page currently contains.
        print("等待图片元素超时,继续解析当前页面源码")

    # Rendered DOM, including anything injected by scripts.
    page_source = driver.page_source
    soup = bs(page_source, 'html.parser')

    image_tags = soup.select(image_selector)
    for tag in image_tags:
        img_src = tag.get('src')
        # Missing or blank src: nothing to download. This check must come
        # first because urljoin(page_url, "") would return the page URL itself.
        if not img_src or not img_src.strip():
            print(f"跳过无效链接:{img_src}")
            continue

        # Resolve src against the page URL so root-relative, relative,
        # protocol-relative and absolute links all become valid URLs.
        full_img_url = urljoin(page_url, img_src.strip())
        parsed_url = urlparse(full_img_url)
        # Keep only the last path component; a name starting with "/" would
        # make os.path.join discard save_path.
        img_filename = os.path.basename(parsed_url.path)

        # Drop data:, javascript:, etc. and URLs without a file name.
        if parsed_url.scheme not in ('http', 'https') or not img_filename:
            print(f"跳过无效链接:{img_src}")
            continue

        full_save_path = os.path.join(save_path, img_filename)

        try:
            img_response = requests.get(full_img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with open(full_save_path, "wb") as f:
                f.write(img_response.content)
            print(f"成功保存:{full_save_path}")
        except requests.exceptions.RequestException as e:
            # RequestException derives from OSError, so it must be caught first.
            print(f"下载图片失败 {full_img_url}:{str(e)}")
            continue
        except OSError as e:
            # A failed write should skip this image, not abort the loop.
            print(f"保存图片失败 {full_save_path}:{str(e)}")
            continue

finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()

关键修改点说明

  1. os.path.join处理路径,避免Windows和Linux的路径分隔符冲突
  2. 增加目录自动创建逻辑,不用手动提前建文件夹
  3. 完善请求头,模拟真实浏览器行为,降低被反爬拦截的概率
  4. 添加异常处理,遇到错误不会直接终止程序,还能打印错误信息方便排查
  5. 过滤无效的src链接,避免生成错误的URL
  6. with open替代手动close(),避免文件未正常关闭的问题

本文的问题来源于 Stack Exchange,提问作者:Roy Sukrit

火山引擎 最新活动