【网络爬虫实战】猫眼电影Top100

抓取首页:http://maoyan.com/board/4?offset=0

代码:

# -*- coding:utf-8 -*-
import requests
from requests.exceptions import RequestException
import re   # 正则需要的包
import json  # json.dumps需要的包
from multiprocessing import Pool  # process pool (multiple processes, not threads)
import time
import  os # os.path模块

def get_one_page(url, timeout=10):
    """Fetch one page and return its HTML text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request raises a
        ``RequestException`` (timeouts included).
    """
    # Send a browser-like User-Agent instead of the default requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text

# One <dd>...</dd> entry per movie. re.S lets '.' also match newlines, which
# is needed because each entry spans several lines of HTML. Raw strings keep
# '\d' from being treated as an (invalid) string escape.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name">'
    r'<a.*?>(.*?)</a>.*?"star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: HTML text of a board page. ``None`` or an empty string (for
            example after a failed download) yields no entries.

    Yields:
        One dict per matched ``<dd>`` entry with the string-valued keys
        ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        # Nothing was downloaded; avoid handing None to the regex engine.
        return
    for index, image, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drop the first 3 / 5 characters after stripping whitespace --
            # presumably the page's field labels; verify against live HTML.
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            # The score is split into integer and fraction parts on the page.
            'score': integer + fraction,
        }

def save_image_file(url, path, timeout=10):
    """Download a movie cover image and write it to disk.

    Args:
        url: Address of the image to download.
        path: Destination file path; the file is written in binary mode.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the caller forever.

    Nothing is written when the request fails or the server does not answer
    with HTTP 200.
    """
    try:
        image_response = requests.get(url, timeout=timeout)
    except RequestException:
        # Same best-effort policy as get_one_page: a failed download is
        # skipped rather than aborting the whole crawl.
        return
    if image_response.status_code == 200:
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(image_response.content)

def write_to_file(content):
    """Append one movie record to ``result.txt`` as a single JSON line.

    ``json.dumps`` escapes non-ASCII characters by default, so
    ``ensure_ascii=False`` is passed to keep Chinese text readable in the
    output file.

    Args:
        content: JSON-serializable object to store.
    """
    with open('result.txt', 'a', encoding='utf-8') as out:
        record = json.dumps(content, ensure_ascii=False)
        out.write(record + '\n')

def main(offset):
    """Crawl one board page: print, store and download covers for its movies.

    Args:
        offset: Value of the ``offset`` query parameter selecting the page.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # The download failed; there is nothing to parse for this page.
        return

    # Several pool workers may reach this point at the same time, so a
    # check-then-mkdir sequence could raise FileExistsError; exist_ok makes
    # the creation safe to repeat.
    os.makedirs('covers', exist_ok=True)

    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        # Cover file name: zero-padded rank followed by the movie title.
        cover_path = 'covers/' + '%03d' % int(item['index']) + item['title'] + '.jpg'
        save_image_file(item['image'], cover_path)

if __name__ == '__main__':
    time_start = time.time()
    # Sequential alternative: for i in range(10): main(i * 10)
    # Offsets advance by 10 per page (0, 10, ..., 90). The pool is used as a
    # context manager so its worker processes are released once map() has
    # returned; map() blocks until every page has been processed.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
    time_end = time.time()
    time_cost = time_end - time_start  # total crawl time in seconds
    print(time_cost)

页面规律:

http://maoyan.com/board/4?offset=0

http://maoyan.com/board/4?offset=10

http://maoyan.com/board/4?offset=20

……

html文本:

不使用进程池耗时:

 

使用进程池耗时:

抓取到的信息如下:

 

参考资料:

崔庆才 Python3爬虫

参考博客:

https://www.jianshu.com/p/8fa68aee0581

 

原文链接:加载失败,请重新获取