Python爬取27报动态图

更新时间:2020-05-20 10:23:00 点击次数:404次
代码如下:

import os
import re
import time
from urllib.request import urlretrieve

import requests
from pyquery import PyQuery as pq


def mkdir(path):
    """Create directory ``path`` (with any missing parents) if nothing exists there.

    Leading/trailing whitespace is stripped from ``path`` first, then any
    trailing backslashes are removed.

    Args:
        path: Directory path to create.
    """
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        return
    # exist_ok=True: the directory may be created by someone else between the
    # existence check above and this call; that must not raise.
    os.makedirs(path, exist_ok=True)


def getImgUrl():
    """Collect gallery links and titles from the site's list pages.

    Requests ``list_1.html`` .. ``list_6.html`` under
    ``https://www.27baobao.com/quanwenyuedu/`` and inspects every anchor
    matched by the selector ``#page ul li a:nth-child(2)``. Each link is
    rewritten for the mobile host by prefixing ``https://m`` to the text that
    follows the first ``www`` in its ``href``.

    Returns:
        A ``(urls, names)`` tuple of two index-aligned lists: the rewritten
        URLs and the stripped anchor texts, accumulated over all list pages.
    """
    headers = {
        'Host': 'www.27baobao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/75.0.3770.142 Safari/537.36'
    }
    base_url = 'https://www.27baobao.com/quanwenyuedu/'
    list_pages = [base_url + 'list_{}.html'.format(n) for n in range(1, 7)]

    # Initialised once, outside the page loop, so results from every list
    # page are kept rather than only those of the last page.
    imgUrls = []
    names = []
    for page_url in list_pages:
        res = requests.get(page_url, headers=headers)
        res.encoding = 'utf-8'
        anchors = pq(res.text)('#page ul li a:nth-child(2)')
        for anchor in anchors.items():
            href = anchor.attr('href')
            # Skip anchors that cannot be rewritten instead of crashing on
            # a missing href or one that does not contain "www".
            if not href or 'www' not in href:
                continue
            imgUrls.append('https://m' + href.split('www')[1])
            names.append(anchor.text().strip())
    return imgUrls, names


def nextPage(url):
    """Build the URLs of the follow-up pages of one gallery.

    Fetches ``url``, reads the text of ``.pagearti a:first-child`` and takes
    the first run of digits in it as the page count.

    Args:
        url: Gallery URL; the 4th ``/``-separated component is used as the
            category and the last component (without extension) as the id,
            e.g. ``https://m.27baobao.com/<category>/<id>.html``.

    Returns:
        A list of ``https://m.27baobao.com/<category>/<id>_<n>.html`` strings
        for ``n`` from 2 up to (but not including) the page count. The list is
        empty when no page count can be found in the pager text.
    """
    headers = {
        'Host': 'm.27baobao.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/75.0.3770.142 Safari/537.36'
    }
    host = 'https://m.27baobao.com/'
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'

    category = url.split('/')[3].split('.')[0]
    page_id = url.split('/')[-1].split('.')[0]

    pager_text = pq(res.text)('.pagearti a:first-child').text()
    # Take the first number in the pager text, whatever its width, instead of
    # guessing the digit count from the string length.
    # Presumably the text looks like "共12页:" -- verify against the site.
    match = re.search(r'\d+', pager_text or '')
    if match is None:
        return []
    total = int(match.group())

    # NOTE(review): the range stops before `total`, so "<id>_<total>.html" is
    # never requested. Confirm against the site whether the last page should
    # be included (i.e. range(2, total + 1)).
    return [
        '{}{}/{}_{}.html'.format(host, category, page_id, n)
        for n in range(2, total)
    ]


def getImg(url, path):
    """Download the image referenced by ``.content img`` on page ``url``.

    The file is saved inside ``path`` under a name taken from the last 16
    characters of the image URL. Pages without a matching image are skipped
    silently; download failures are reported on stdout and do not raise.

    Args:
        url: Page to fetch.
        path: Existing directory the image is written into.
    """
    headers = {
        'Host': 'm.27baobao.com',
        'Referer': 'https://m.27baobao.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/75.0.3770.142 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    img_src = pq(res.text)('.content img').attr('src')
    if not img_src:
        return

    # Keep only what follows the last "/" so a short URL cannot smuggle a
    # path separator into the file name.
    filename = img_src[-16:].rsplit('/', 1)[-1]
    target = os.path.join(path, filename)
    try:
        urlretrieve(img_src, target)
    except Exception as err:
        # Best effort: one failed image must not abort the whole crawl, but
        # the failure is reported instead of being swallowed silently.
        print('{}下载失败: {}'.format(img_src, err))
    else:
        print('{}下载完成'.format(img_src))


def main(base_dir='F:\\下载\\tu'):
    """Crawl every gallery and save its images under ``base_dir``.

    For each (url, title) pair returned by ``getImgUrl`` a sub-directory named
    after the title is created, the first page's image is downloaded, and then
    the image of every page returned by ``nextPage``. The loop sleeps one
    second after each follow-up page and after each gallery.

    Args:
        base_dir: Root directory for the downloads; defaults to the location
            that used to be hard-coded.
    """
    urls, names = getImgUrl()
    for gallery_url, name in zip(urls, names):
        path = os.path.join(base_dir, name)
        mkdir(path)
        getImg(gallery_url, path)
        for page_url in nextPage(gallery_url):
            getImg(page_url, path)
            time.sleep(1)  # throttle requests between pages
        time.sleep(1)  # throttle requests between galleries


# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()


本站文章版权归原作者及原出处所有。内容为作者个人观点,并不代表本站赞同其观点和对其真实性负责,本站只提供参考并不构成任何投资及应用建议。本站是一个个人学习交流的平台,网站上部分文章为转载,并不用于任何商业目的,我们已经尽可能地对作者和来源进行了通告,但是能力有限或疏忽,造成漏登,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

  • 项目经理 点击这里给我发消息
  • 项目经理 点击这里给我发消息
  • 项目经理 点击这里给我发消息
  • 项目经理 点击这里给我发消息