爬虫网站:百度图片搜索
小小代码,直接奉上源码
import json
import os
import re
import time
from urllib.parse import urlencode

import requests
from fake_useragent import UserAgent
# Request headers shared by every HTTP call in this script. The value is
# evaluated once at import time, so all requests in a run send the same
# User-Agent string.
headers = {"User-Agent": UserAgent().random}
def get_photo(url, page, n):
    """Download one image and save it as ``photo/<page>_<n>.jpg``.

    The ``photo`` directory is created inside the current working directory
    when it does not exist yet. Responses with a non-success HTTP status are
    reported and skipped instead of being written to disk.

    Args:
        url: Address of the image to download.
        page: Page number; first component of the file name.
        n: Index of the image within the page; second component of the
            file name.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the directory or the file cannot be created.
    """
    response = requests.get(url, headers=headers, timeout=5)
    if not response.ok:
        # The body of an error response is not image data; storing it under
        # a .jpg name would leave a corrupt file behind.
        print("下载失败, 已跳过: {} (HTTP {})".format(url, response.status_code))
        return

    # os.path.join keeps the path valid on every platform; the original
    # hard-coded "\\" separators only worked on Windows.
    photo_dir = os.path.join(os.getcwd(), "photo")
    # exist_ok=True replaces the separate exists()/mkdir() pair and is safe
    # when the directory already exists.
    os.makedirs(photo_dir, exist_ok=True)

    file_path = os.path.join(photo_dir, "{}_{}.jpg".format(page, n))
    with open(file_path, "wb") as f:
        f.write(response.content)
def _escape_invalid_backslashes(text):
    """Double only the backslashes that do not start a valid JSON escape."""

    def fix(match):
        pair = match.group(0)
        # pair is a backslash plus the character after it. Valid JSON
        # escapes are kept as they are; anything else (for example \')
        # gets its backslash doubled so the parser reads it literally.
        return pair if pair[1] in '"\\/bfnrtu' else "\\" + pair

    # Matching the backslash together with the next character consumes
    # "\\\\" pairs as a unit, so an already-escaped backslash is never
    # mistaken for the start of a new escape.
    return re.sub(r"\\.", fix, text)


def spider(base_url, pn, time_stamp, gsm, keyword):
    """Fetch one page of search results and download every image in it.

    Builds the query string, requests the JSON result page, and calls
    ``get_photo`` for each entry of the ``data`` list that carries a
    non-empty ``hoverURL``.

    Args:
        base_url: Endpoint URL ending in ``?``; the encoded query is
            appended to it.
        pn: Page number. Sent as ``pn * 30`` in the query and used as the
            page component of the saved file names.
        time_stamp: Value sent as a bare query key with an empty value.
        gsm: Value sent unchanged as the ``gsm`` query parameter.
        keyword: Search keyword; "头像" is appended to it for the
            ``queryWord`` and ``word`` parameters.

    Raises:
        requests.RequestException: If the result request fails or times out.
        json.JSONDecodeError: If the response body is not parseable JSON.
        KeyError: If the parsed response has no ``data`` key.
    """
    params = {
        "tn": "resultjson_com",
        "logid": "7151036769869146123",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": keyword + "头像",
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "",
        "z": "",
        "ic": "",
        "hd": "",
        "latest": "",
        "copyright": "",
        "word": keyword + "头像",
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "",
        "istype": "",
        "qc": "",
        "nc": "",
        "fr": "",
        "expermode": "",
        "nojc": "",
        "cg": "head",
        "pn": str(pn * 30),
        "rn": "30",
        "gsm": gsm,
        str(time_stamp): "",
    }
    url = base_url + urlencode(params)
    raw_text = requests.get(url, headers=headers, timeout=5).text

    # Doubling every backslash (the previous approach) also mangled valid
    # escapes such as \" and \/; only the invalid ones are repaired here.
    json_response = json.loads(_escape_invalid_backslashes(raw_text))

    # The previous loop dropped the last entry unconditionally, presumably
    # because the list ends with an empty placeholder. Skipping entries
    # without a usable URL covers that case and also avoids a KeyError or an
    # empty-URL request for any other incomplete entry. The index passed to
    # get_photo is still the entry's position in the list, so file names are
    # unchanged.
    for i, entry in enumerate(json_response["data"]):
        photo_url = entry.get("hoverURL")
        if not photo_url:
            continue
        print(photo_url)
        get_photo(photo_url, pn, i)
def main():
    """Prompt for a keyword and a page count, then crawl page by page.

    Pages are numbered from 1. For each page the current time in
    milliseconds is passed to ``spider`` together with a ``gsm`` value
    derived once from the requested page count. The loop pauses one second
    after every page.
    """
    endpoint = "https://image.baidu.com/search/acjson?"
    search_word = input("请输入您想要爬取的图片关键字:")
    print("------------------------------------------------------------------------------")
    total_pages = int(input("请输入要爬取的页数:"))
    gsm_value = hex(total_pages * 30)

    page_no = 1
    while page_no <= total_pages:
        print("------------------------------------------------------------------------------")
        print("---------------------------------开始爬取第{}页---------------------------------".format(page_no))
        # Millisecond timestamp taken right before the request is issued.
        now_ms = round(time.time() * 1000)
        spider(endpoint, page_no, now_ms, gsm_value, search_word)
        print("---------------------------------第{}页爬取完毕---------------------------------".format(page_no))
        time.sleep(1)
        page_no += 1

    print("爬取完毕, 主人!")
# Start the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
演示