1. Project Overview


Haha, this started as a request from my ex-girlfriend: a senior student of hers had her hunting down insect pictures one by one and then renaming each of them according to a list of names. She kept coming over to complain about how tedious it was, so I figured I'd just write her a crawler to automate it, since searching for them one by one really was too miserable. Of course, many thanks to my good classmate zz here.

2. How to Use


1. Prepare the names you want to search for and rename by in a txt file, one per line (see the sample right after these steps).

2. Make sure an images folder exists in the working directory; this is where the downloaded pictures are written (the script also creates it automatically if it is missing).

3. Open the program in VS Code and run it; the final pictures will end up in the images folder.
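The script reads the list from a file named insects.txt in the same directory. Only the Chinese characters on each line are used as the search keyword, while the whole line (stripped of whitespace) becomes the saved file name. A hypothetical insects.txt might look like this (the entries are just examples):

001 中华大刀螳
002 东亚飞蝗
003 七星瓢虫

With the first line, the script would search Bing Images for 中华大刀螳 and save the best result as images/001 中华大刀螳.jpg.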

3. Source Code


Open-source link: https://github.com/Sigma-Pirrow/picture-crawling-and-rename-program
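Besides the standard library, the script depends on requests, BeautifulSoup (bs4), and Pillow; urllib3 is pulled in by requests. Assuming a standard Python 3 environment, the dependencies can be installed with something like:

pip install requests beautifulsoup4 pillow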

import requests
from bs4 import BeautifulSoup
import os
from PIL import Image
from io import BytesIO
import re
import json
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Make sure there is a folder to store the images
os.makedirs('images', exist_ok=True)

# Read the list of names from the text file
with open('insects.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Configure a retry strategy
retry_strategy = Retry(
    total=3,                                     # retry up to 3 times in total
    backoff_factor=1,                            # delay factor between retries
    status_forcelist=[429, 500, 502, 503, 504],  # status codes that trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"]   # request methods that may be retried
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

for line in lines:
    # Extract the Chinese characters from the line
    chinese_name = ''.join(filter(lambda x: u'\u4e00' <= x <= u'\u9fff', line))

    # Search for the Chinese name
    search_url = f"https://cn.bing.com/images/async?q={chinese_name}"  # replace with the actual search engine URL if needed

    try:
        response = http.get(search_url, timeout=10)  # timeout so a dead host cannot hang the run
        response.raise_for_status()  # check that the request succeeded
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch search results for {chinese_name}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the URLs of the first three images (the selector may need
    # adjusting to match the actual page structure)
    image_tags = soup.find_all('a', {'class': 'iusc'}, limit=3)
    image_urls = []

    for tag in image_tags:
        try:
            m_data = json.loads(tag['m'])
            image_url = m_data.get('murl')
            if image_url and not re.match(r'^data:', image_url):
                image_urls.append(image_url)
        except (KeyError, json.JSONDecodeError):
            continue

    if image_urls:
        best_image_url = None
        best_image_pixels = 0

        # Try each candidate until one downloads successfully or all have been tried
        for selected_image_url in image_urls:
            try:
                img_response = http.get(selected_image_url, timeout=10)
                img_response.raise_for_status()  # check that the request succeeded
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch image from URL: {selected_image_url}: {e}")
                continue

            try:
                img = Image.open(BytesIO(img_response.content))
                # Convert RGBA or palette images to RGB so they can be saved as JPEG
                if img.mode in ('RGBA', 'P'):
                    img = img.convert('RGB')

                # Total pixel count
                img_pixels = img.width * img.height

                # Keep the highest-resolution image seen so far
                if img_pixels > best_image_pixels:
                    best_image_url = selected_image_url
                    best_image_pixels = img_pixels
                    best_img = img
            except IOError:
                print(f"Failed to open image from URL: {selected_image_url}")
                continue

        if best_image_url:
            # Save the best image, named after the original line
            img_name = line.strip() + '.jpg'
            best_img.save(os.path.join('images', img_name))
            print(f"Saved {img_name}")
        else:
            print(f"No valid image URL found for {chinese_name}")
    else:
        print(f"No valid image URL found for {chinese_name}")
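As a quick sanity check of the naming logic: the lambda keeps only characters in the CJK Unified Ideographs range (U+4E00 to U+9FFF), so a line mixing numbering or Latin names with Chinese still yields a clean Chinese search keyword. A minimal sketch (the sample line is hypothetical):

# Sketch of the extraction step above; the sample line is hypothetical
line = '001 Tenodera sinensis 中华大刀螳\n'
chinese_name = ''.join(filter(lambda x: u'\u4e00' <= x <= u'\u9fff', line))
print(chinese_name)           # 中华大刀螳  (used as the search keyword)
print(line.strip() + '.jpg')  # 001 Tenodera sinensis 中华大刀螳.jpg  (saved file name)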