报错信息
urllib.error.HTTPError: HTTP Error 403: Forbidden
import requests
from pyquery import PyQuery as pq
from urllib.request import urlretrieve
import ssl
# NOTE: this replaces the default HTTPS context factory used by urllib with one
# that does not verify certificates, for the whole process. Only acceptable in
# a throwaway demo; do not copy into production code.
ssl._create_default_https_context = ssl._create_unverified_context

url = "https://list.jd.com/list.html?cat=9987,653,655"
# Browser-like User-Agent sent with the listing-page request.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
}

# Download the listing page and parse its HTML.
res = requests.get(url, headers=headers)
content = res.text
doc = pq(content)
# Select the <img> elements whose width and height attributes are both 220.
imlist = doc("img[width='220'][height='220']")

m = 1  # running index used to build the output file name
for im in imlist.items():
    # Get the image URL. Presumably "data-lazy-img" holds a protocol-relative
    # URL ("//host/path"), hence the "https:" prefix -- verify against the page.
    imurl = "https:" + im.attr("data-lazy-img")
    print(imurl)
    # Save the image. urlretrieve() is called without the `headers` dict above,
    # and this is the call that raises "HTTP Error 403: Forbidden".
    # NOTE: the ./mypic directory must already exist.
    urlretrieve(imurl, './mypic/p' + str(m) + ".jpg")
    m += 1
出现该错误的原因是服务器开启了反爬虫机制。一般情况下只需要设置请求头(headers)中的 User-Agent 来模拟浏览器即可,但是 urlretrieve 并未提供设置请求头的参数,它发出的请求使用 urllib 默认的 User-Agent(形如 Python-urllib/3.x),因此被服务器拒绝并返回 403。
修改
把 for 循环中调用 urlretrieve 的这一行代码改成下面的写法就可以了(注意保持循环体内的缩进):
# Replacement for the urlretrieve(...) line. It belongs inside the for-loop
# body (indent it one level there), where imurl, m and headers are defined.
with requests.get(imurl, stream=True, headers=headers) as ir:
    # Fail loudly on a 4xx/5xx response instead of saving an error page as .jpg.
    ir.raise_for_status()
    with open("./mypic/p" + str(m) + ".jpg", "wb") as f:
        # Stream the body in 8 KiB pieces; iterating the response object
        # directly would yield much smaller (128-byte) chunks.
        for chunk in ir.iter_content(chunk_size=8192):
            f.write(chunk)
正文完