1. Project Overview


Haha, this started as a request from my ex-girlfriend: a senior student of hers had her hunting down insect pictures one by one and then renaming each of them according to a list of names. She kept coming over to complain about how tedious it was, so I figured I'd just write her a crawler to automate it, since searching for them one by one really was too miserable. Of course, many thanks to my good classmate zz here.

2. How to Use


1. Prepare the names you want to search for and rename by in a txt file, one per line (see the sample right after these steps).

2. Make sure an images folder exists in the working directory; this is where the downloaded pictures are written (the script also creates it automatically if it is missing).

3. Open the program in VS Code and run it; the final pictures will end up in the images folder.
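The script reads the list from a file named insects.txt in the same directory. Only the Chinese characters on each line are used as the search keyword, while the whole line (stripped of whitespace) becomes the saved file name. A hypothetical insects.txt might look like this (the entries are just examples):

001 中华大刀螳
002 东亚飞蝗
003 七星瓢虫

With the first line, the script would search Bing Images for 中华大刀螳 and save the best result as images/001 中华大刀螳.jpg.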

3. Source Code


Open-source link: https://github.com/Sigma-Pirrow/picture-crawling-and-rename-program
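Besides the standard library, the script depends on requests, BeautifulSoup (bs4), and Pillow; urllib3 is pulled in by requests. Assuming a standard Python 3 environment, the dependencies can be installed with something like:

pip install requests beautifulsoup4 pillow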

import requests
from bs4 import BeautifulSoup
import os
from PIL import Image
from io import BytesIO
import re
import json
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Make sure there is a folder to store the images
os.makedirs('images', exist_ok=True)

# Read the list of names from the text file
with open('insects.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Configure a retry strategy
retry_strategy = Retry(
    total=3,                                     # retry up to 3 times in total
    backoff_factor=1,                            # delay factor between retries
    status_forcelist=[429, 500, 502, 503, 504],  # status codes that trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"]   # request methods that may be retried
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

for line in lines:
    # Extract the Chinese characters from the line
    chinese_name = ''.join(filter(lambda x: u'\u4e00' <= x <= u'\u9fff', line))

    # Search for the Chinese name
    search_url = f"https://cn.bing.com/images/async?q={chinese_name}"  # replace with the actual search engine URL if needed

    try:
        response = http.get(search_url, timeout=10)  # timeout so a dead host cannot hang the run
        response.raise_for_status()  # check that the request succeeded
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch search results for {chinese_name}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the URLs of the first three images (the selector may need
    # adjusting to match the actual page structure)
    image_tags = soup.find_all('a', {'class': 'iusc'}, limit=3)
    image_urls = []

    for tag in image_tags:
        try:
            m_data = json.loads(tag['m'])
            image_url = m_data.get('murl')
            if image_url and not re.match(r'^data:', image_url):
                image_urls.append(image_url)
        except (KeyError, json.JSONDecodeError):
            continue

    if image_urls:
        best_image_url = None
        best_image_pixels = 0

        # Try each candidate until one downloads successfully or all have been tried
        for selected_image_url in image_urls:
            try:
                img_response = http.get(selected_image_url, timeout=10)
                img_response.raise_for_status()  # check that the request succeeded
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch image from URL: {selected_image_url}: {e}")
                continue

            try:
                img = Image.open(BytesIO(img_response.content))
                # Convert RGBA or palette images to RGB so they can be saved as JPEG
                if img.mode in ('RGBA', 'P'):
                    img = img.convert('RGB')

                # Total pixel count
                img_pixels = img.width * img.height

                # Keep the highest-resolution image seen so far
                if img_pixels > best_image_pixels:
                    best_image_url = selected_image_url
                    best_image_pixels = img_pixels
                    best_img = img
            except IOError:
                print(f"Failed to open image from URL: {selected_image_url}")
                continue

        if best_image_url:
            # Save the best image, named after the original line
            img_name = line.strip() + '.jpg'
            best_img.save(os.path.join('images', img_name))
            print(f"Saved {img_name}")
        else:
            print(f"No valid image URL found for {chinese_name}")
    else:
        print(f"No valid image URL found for {chinese_name}")
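As a quick sanity check of the naming logic: the lambda keeps only characters in the CJK Unified Ideographs range (U+4E00 to U+9FFF), so a line mixing numbering or Latin names with Chinese still yields a clean Chinese search keyword. A minimal sketch (the sample line is hypothetical):

# Sketch of the extraction step above; the sample line is hypothetical
line = '001 Tenodera sinensis 中华大刀螳\n'
chinese_name = ''.join(filter(lambda x: u'\u4e00' <= x <= u'\u9fff', line))
print(chinese_name)           # 中华大刀螳  (used as the search keyword)
print(line.strip() + '.jpg')  # 001 Tenodera sinensis 中华大刀螳.jpg  (saved file name)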