A practical way to scrape images from 4chan.org with Python
https://github.com/pmthink/get_pic_for_4chan
Usage:
git clone https://github.com/pmthink/get_pic_for_4chan.git
cd get_pic_for_4chan
pip3 install -r install.txt
python3 get.py
I'm sure you're not just here to scrape images from the /diy/ board…
To scrape images from 4chan, requests plus BeautifulSoup is enough. Here is a complete, working script:
import requests
from bs4 import BeautifulSoup
import os
import time


def download_4chan_images(board, thread_id, save_dir='4chan_images'):
    """
    Download all images from a given 4chan thread.

    Args:
        board: board name, e.g. 'g', 'a', 'b'
        thread_id: thread ID
        save_dir: directory to save the images into
    """
    # Create the save directory
    os.makedirs(save_dir, exist_ok=True)

    # Build the thread URL
    base_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'

    try:
        # Fetch the page
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(base_url, headers=headers)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all image links (4chan full-size images hang off anchors with the fileThumb class)
        image_links = []
        for link in soup.find_all('a', class_='fileThumb'):
            href = link.get('href')
            if href and href.startswith('//'):
                # Complete the protocol-relative URL
                full_url = 'https:' + href
                image_links.append(full_url)

        print(f"Found {len(image_links)} images")

        # Download the images one by one
        for i, img_url in enumerate(image_links, 1):
            try:
                # Build the local filename
                filename = os.path.join(save_dir, img_url.split('/')[-1])

                # Download the image
                img_response = requests.get(img_url, headers=headers)
                img_response.raise_for_status()

                # Save it to disk
                with open(filename, 'wb') as f:
                    f.write(img_response.content)

                print(f"Downloaded: {filename} ({i}/{len(image_links)})")

                # Be polite: small delay to avoid hammering the server
                time.sleep(0.5)
            except Exception as e:
                print(f"Download failed for {img_url}: {e}")

    except Exception as e:
        print(f"Failed to fetch thread: {e}")


# Usage example
if __name__ == "__main__":
    # Download the images from a thread on /g/
    download_4chan_images('g', '123456789', 'downloaded_images')
How the script works:
- Build the target thread's URL (you need the board name and the thread ID)
- Fetch the page HTML with requests
- Parse it with BeautifulSoup and collect every link with the fileThumb class (these are 4chan's image thumbnail links), turning the protocol-relative URLs into full URLs
- Download and save the images one by one
To use it, install the dependencies first:
pip install requests beautifulsoup4
Then adjust the arguments in download_4chan_images('g', '123456789'):
- The first argument is the board, e.g. 'g', 'a', 'b'
- The second argument is the thread ID, which you can read off the browser address bar (a small helper for pulling both out of a thread URL is sketched right after this list)
- The third argument is the save directory, optional
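If you would rather paste the whole thread URL from the address bar instead of picking out the ID by hand, a minimal sketch like the following works; parse_thread_url is a hypothetical helper, not part of the script above, and it assumes the usual https://boards.4chan.org/<board>/thread/<id> URL shape:

from urllib.parse import urlparse

def parse_thread_url(url):
    # A thread URL looks like https://boards.4chan.org/g/thread/123456789
    # (sometimes with a trailing slug); split the path into its segments.
    parts = urlparse(url).path.strip('/').split('/')
    board, thread_id = parts[0], parts[2]
    return board, thread_id

board, thread_id = parse_thread_url('https://boards.4chan.org/g/thread/123456789')
download_4chan_images(board, thread_id, 'downloaded_images')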
Notes:
- A User-Agent header is set to avoid being blocked by simple filters
- Errors are handled, so a failed download just moves on to the next image
- A 0.5-second delay keeps the request rate reasonable
If you want to scrape a whole board, first fetch the board index with requests.get(f'https://boards.4chan.org/{board}/'), parse out all the thread links, and then call this function for each thread; a rough sketch of that follows below.
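Here is a minimal sketch of that board-wide approach. It assumes any anchor on the index page whose href contains 'thread/<digits>' points at a thread (a heuristic, since the exact class names in 4chan's board markup are not shown in this thread), and it only covers the threads on the first index page. download_board_images and max_threads are names introduced here for illustration:

import re
import requests
from bs4 import BeautifulSoup

def download_board_images(board, save_dir='4chan_images', max_threads=None):
    # Fetch the board index page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    resp = requests.get(f'https://boards.4chan.org/{board}/', headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Collect thread IDs from any link that looks like 'thread/<digits>'
    thread_ids = set()
    for link in soup.find_all('a', href=True):
        m = re.search(r'thread/(\d+)', link['href'])
        if m:
            thread_ids.add(m.group(1))

    print(f"Found {len(thread_ids)} threads on /{board}/")
    for thread_id in sorted(thread_ids)[:max_threads]:
        download_4chan_images(board, thread_id, save_dir)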
In short: requests + BeautifulSoup is all you need to scrape images from 4chan.
/gif is nice
You're a real pro, nice work ~
Bookmarked.
$ python3 get.py
http://boards.4chan.org/diy/
Traceback (most recent call last):
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "C:\developer\Python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "C:\developer\Python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "C:\developer\Python36\lib\http\client.py", line 258, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\developer\Python36\lib\socket.py", line 586, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\developer\Python36\lib\site-packages\requests\adapters.py", line 445, in send
    timeout=timeout
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "C:\developer\Python36\lib\site-packages\urllib3\util\retry.py", line 367, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "C:\developer\Python36\lib\site-packages\urllib3\packages\six.py", line 685, in reraise
    raise value.with_traceback(tb)
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "C:\developer\Python36\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "C:\developer\Python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "C:\developer\Python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "C:\developer\Python36\lib\http\client.py", line 258, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\developer\Python36\lib\socket.py", line 586, in readinto
    return self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "get.py", line 44, in <module>
    get_pic("diy")
  File "get.py", line 31, in get_pic
    get_pics_from_url(channel_url)
  File "get.py", line 16, in get_pics_from_url
    r = requests.get(url)
  File "C:\developer\Python36\lib\site-packages\requests\api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "C:\developer\Python36\lib\site-packages\requests\api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\developer\Python36\lib\site-packages\requests\sessions.py", line 512, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\developer\Python36\lib\site-packages\requests\sessions.py", line 622, in send
    r = adapter.send(request, **kwargs)
  File "C:\developer\Python36\lib\site-packages\requests\adapters.py", line 495, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
If you use Linux, you can run:
python3 get2.py
It's a lot faster.

