Python Web Scraping: Going Further with Requests and BeautifulSoup

Tags: Python  Beginner  Web Scraping

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re


res = requests.get('http://news.sina.com.cn/o/2017-12-01/doc-ifyphtze3020095.shtml')
res.encoding = 'utf-8'

# Print the article title
soup = BeautifulSoup(res.text, 'html.parser')
print(soup.select('#artibodyTitle')[0].text)
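
# A slightly safer variant (just a sketch): select_one() returns the first
# match or None, so a missing element can be handled without an IndexError.
'''
title_tag = soup.select_one('#artibodyTitle')
if title_tag is not None:
    print(title_tag.text)
'''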

# Get the publication time of the article
timesource = soup.select('#navtimeSource')[0].contents[0].strip()

# Get the article source (the commented line shows an alternative via contents)
#wsource = soup.select('#navtimeSource')[0].contents[1].text.strip()
wsource = soup.select('#navtimeSource span a')[0].text

# Get the link of the article source
wa = soup.select('#navtimeSource span a')[0].get('href')

# Convert the time string into a datetime object
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
# Convert the datetime back into a string
#dtstr = dt.strftime('%Y-%m-%d %H:%M:%S')
# Print the time, source and link in order
print(dt, wsource, wa)
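
# For illustration (the exact value depends on the page): a timesource string
# such as '2017年12月01日10:23' parses as
#   datetime.strptime('2017年12月01日10:23', '%Y年%m月%d日%H:%M')
#   -> datetime(2017, 12, 1, 10, 23)
# and dt.strftime('%Y-%m-%d %H:%M:%S') would give back '2017-12-01 10:23:00'.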


# Get the article body ([:-1] skips the last paragraph)
# Method 1
'''
article = []
for p in soup.select('#artibody p')[:-1]:
    article.append(p.text)

print(' '.join(article))
'''
# Method 2: a one-liner with a generator expression, no intermediate list
print(' '.join(p.text.strip() for p in soup.select('#artibody p')[:-1]))


# Get the name of the responsible editor
print(soup.select('.article-editor')[0].text)
#print(soup.select('.article-editor')[0].text.lstrip('责任编辑:'))
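
# Note: lstrip('责任编辑:') removes a *set* of characters from the left, not the
# literal prefix, so it can eat more than intended. A more robust sketch
# (assuming the text always starts with '责任编辑:'):
'''
editor = soup.select('.article-editor')[0].text.strip()
print(editor.split('责任编辑:')[-1])
'''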


# Extract the news ID from the URL
# Method 1: chained split/strip calls
newsurl = 'http://news.sina.com.cn/o/2017-12-01/doc-ifyphtze3020095.shtml'
newsid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')
print('newsid =', newsid)
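# Note: rstrip()/lstrip() also operate on character sets rather than literal
# suffixes/prefixes; this happens to give the right ID for this URL, but
# Method 2 below (a regular expression) is the more reliable approach.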
'''
# Method 2: regular expression
m = re.search('doc-i(.+).shtml',newsurl)
print(m.group(0))
print(m.group(1))
'''
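
To wrap up, the steps above can be collected into one reusable function. The sketch below is only illustrative: it assumes the Sina article pages keep the same element ids and classes used throughout this post (#artibodyTitle, #navtimeSource, #artibody, .article-editor), and the name get_news_detail is made up for this example.

def get_news_detail(newsurl):
    # Fetch one article page and return its fields as a dict (sketch)
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    timesource = soup.select('#navtimeSource')[0].contents[0].strip()
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['source'] = soup.select('#navtimeSource span a')[0].text
    result['source_url'] = soup.select('#navtimeSource span a')[0].get('href')
    result['article'] = ' '.join(p.text.strip() for p in soup.select('#artibody p')[:-1])
    result['editor'] = soup.select('.article-editor')[0].text
    result['newsid'] = re.search('doc-i(.+).shtml', newsurl).group(1)
    return result

# Example usage (same article as above):
# print(get_news_detail('http://news.sina.com.cn/o/2017-12-01/doc-ifyphtze3020095.shtml'))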

Output:


Copyright notice: This is an original article by Shuaiyiping, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original article: https://blog.csdn.net/Shuaiyiping/article/details/78718837