网络爬虫之解析网页BeautifulSoup

BeautifulSoup的基本用法

安装bs4 和 lxml
bs4里面含有 BeautifulSoup

from  bs4 import BeautifulSoup

把官网的例子复制过来做案例

# Example HTML document copied from the official BeautifulSoup
# documentation ("The Dormouse's story"); every demo snippet
# below parses this string.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# ----------------------------------------------------------------------
# BeautifulSoup basics — a runnable walkthrough.
# Each step mirrors the original tutorial notes; uncomment the print()
# calls to see the output shown in the trailing comments.
# ----------------------------------------------------------------------

# Instantiate a BeautifulSoup object over html_doc using the lxml parser.
soup = BeautifulSoup(html_doc, "lxml")

# Pretty-print: output the HTML with strict, consistent indentation.
# print(soup)
# print(soup.prettify())

# Get a tag by attribute access.
# tag = soup.title
# print(tag)              # <title>The Dormouse's story</title>

# Tag name only.
# name = tag.name
# print(name)             # title

# Text content of the tag (use `text`, not `str`, to avoid shadowing
# the builtin).
# text = tag.string
# print(text)             # The Dormouse's story

# When several tags match, attribute access returns the FIRST one
# (scanning the document top to bottom).
tag = soup.p
# print(tag)              # <p class="title"><b>The Dormouse's story</b></p>

# Find every <p> tag.
tags = soup.find_all("p")
# print(len(tags))        # 3

# Filter by attribute value; format: attrs={"attribute": "value"}.
tags = soup.find_all("p", attrs={"class": "story"})
# print(len(tags))        # 2

# Exact find.  NOTE: `class` clashes with the Python keyword, so
# BeautifulSoup spells the parameter `class_`.
tag = soup.find(class_="title")
# print(tag)              # <p class="title"><b>The Dormouse's story</b></p>

# Parent node.
tag = soup.title
# print(tag.parent)       # <head><title>The Dormouse's story</title></head>

# Parent node's tag name.
# print(tag.parent.name)  # head

# Grandparent and beyond; the outermost ancestor is [document],
# the root of the whole parse tree.
# print(tag.parent.parent.name)
# print(tag.parent.parent.parent.name)   # [document]

# Reading attribute values from a tag.
tag = soup.p
# 1) via .get()
value = tag.get("class")
# print(value)            # ['title'] — a list, since class may hold several values

# 2) via subscripting
value = soup.p["class"]
# print(value)            # ['title']
value = soup.a.get("id")
# print(value)            # link1 — id is single-valued, so no list

# Every attribute of the first <a> tag, as a dict.
attrs = soup.a.attrs
# print(attrs)  # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

# --- Three directions for navigating the tree -------------------------

# 1) Downwards.
# tag = soup.body
# print(tag.contents)     # direct children, as a list
# print(tag.children)     # direct children, as an iterator — loop to see them
# for child in tag.children:
#     print(child)
#
# print(tag.descendants)  # ALL descendants, recursively
# for node in tag.descendants:
#     print(node)         # includes the whitespace/text nodes too
#     print("*" * 20)

# 2) Upwards.
# print(soup.title.parent.name)   # head
# .parents iterates over every ancestor, innermost first:
# for ancestor in soup.title.parents:
#     print(ancestor.name)        # head, html, [document]

# 3) Sideways (siblings).
tag = soup.a
# The first next_sibling is the text node between the tags, hence the
# double hop to reach the next <a>.
# print(tag.next_sibling.next_sibling.name)   # a

BeautifulSoup综合案例:

爬取“猫眼电影的排行榜”

"""
BeautifulSoup综合案例:爬取“猫眼电影的排行榜”
"""

import os

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

response = requests.get("https://maoyan.com/board/4", headers=headers)

# Only parse and download when the page actually came back OK.
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "lxml")
    img_tags = soup.find_all("img", attrs={"class": "board-img"})

    # Create the folder "第一页" under the current directory and save the
    # posters INTO it; exist_ok avoids a crash when the script is re-run.
    save_dir = os.path.join(os.getcwd(), "第一页")
    os.makedirs(save_dir, exist_ok=True)

    for img in img_tags:
        # NOTE: the served HTML can differ from what the browser shows —
        # print(img) to confirm attribute names when debugging.
        name = img.get("alt")
        # The real image URL lives in data-src (lazy loading), not src.
        src = img.get("data-src")

        # Fetch the poster itself and write it out as binary.
        resp = requests.get(src, headers=headers)
        with open(os.path.join(save_dir, f"{name}.png"), "wb") as f:
            f.write(resp.content)
        print(f"{name}{src} 保存成功")

(原文此处为运行结果截图,未能随文本保留)

爬取"最好大学网"排行

"""
BeautifulSoup综合案例
爬取"最好大学网"排行
"""
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

response = requests.get("http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html", headers=headers)
# The page is declared as utf-8; set it explicitly so Chinese text decodes.
response.encoding = "utf-8"

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "lxml")
    # Each ranked university sits in a <tr class="alt"> row.
    tr_tags = soup.find_all("tr", attrs={"class": "alt"})
    for tr in tr_tags:
        # contents[0..3]: rank, university name, province, total score.
        # `rank` avoids shadowing the builtin `id`.
        rank = tr.contents[0].string
        name = tr.contents[1].string
        addr = tr.contents[2].string
        score = tr.contents[3].string
        print(f"{rank} {name} {addr} {score}")


(原文此处为运行结果截图,未能随文本保留)

原文链接:原始出处链接已失效,无法提供