【Python Sina Weibo crawler】Scraping Sina Weibo's hot topics with Python

Tags: python  crawler  Sina Weibo  hot topics


I've been learning web scraping recently, and following this post (https://blog.csdn.net/u013421629/article/details/72679369?utm_source=itdadao&utm_medium=referral) I coded up a crawler for Sina Weibo's hot topics. It logs in, walks the topic ranking pages, extracts each field with regular expressions, and saves everything to an Excel file. The code is as follows:

# encoding: utf-8
import requests
import json
import base64
import re
import time
import pandas as pd

################## Lists that accumulate the scraped fields
top_name = []
top_reading = []
top_rank = []
top_subtitle = []
host_name = []
host_follow = []
host_fans = []
host_weibo = []

########### Log in to Sina's SSO endpoint and return an authenticated session
def login(username, password):
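    # 'su' is the base64-encoded username; since this simplified login skips
    # the servertime/nonce handshake, 'sp' below carries the password as-is.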
    username = base64.b64encode(username.encode('utf-8')).decode('utf-8')
    postData = {
        "entry": "sso",
        "gateway": "1",
        "from": "null",
        "savestate": "30",
        "useticket": "0",
        "pagerefer": "",
        "vsnf": "1",
        "su": username,
        "service": "sso",
        "sp": password,
        "sr": "1440*900",
        "encoding": "UTF-8",
        "cdult": "3",
        "domain": "sina.com.cn",
        "prelt": "0",
        "returntype": "TEXT",
    }
    loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
    session = requests.Session()
    res = session.post(loginURL, data=postData)
    jsonStr = res.content.decode('gbk')
    info = json.loads(jsonStr)
    if info["retcode"] == "0":
        print(U"登录成功")
        # 把cookies添加到headers中,必须写这一步,否则后面调用API失败
        cookies = session.cookies.get_dict()
        cookies = [key + "=" + value for key, value in cookies.items()]
        cookies = "; ".join(cookies)
        session.headers["cookie"] = cookies
    else:
        print(U"登录失败,原因: %s" % info["reason"])
    return session


def analysis(topic):
    ###print(topic)

    ### Extract the topic's rank ###
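    # e.g. a badge like <span class="DSC_topicon_red">1</span> yields '1'
    # (constructed illustration; the three class variants are the
    # differently coloured rank badges)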
    topicrank = re.search('<span class="(?:DSC_topicon_red|DSC_topicon|DSC_topicon_orange)">(.*?)</span>', topic, re.S)
    if topicrank is None:
        top_rank.append('')
    else:
        top_rank.append(topicrank.group(1))

    ### Extract the topic title ###
    topicname = re.search('alt="(.*?)" class="pic">', topic, re.S)
    if topicname is None:
        top_name.append('')
    else:
        top_name.append(topicname.group(1))

    ### Extract the subtitle ###
    subtitle = re.search('class="subtitle">(.*?)</div>', topic, re.S)
    if subtitle is None:
        top_subtitle.append('')
    else:
        top_subtitle.append(subtitle.group(1))

    ### Extract the read count ###
    readingcount = re.search('<span class="number">(.*?) </span>', topic, re.S)
    if readingcount is None:
        top_reading.append('')
    else:
        top_reading.append(readingcount.group(1))

    ### Extract the topic host ###
    ppname = re.search(r'class="tlink S_txt1"[\s]+>(.*?)</a></div>', topic, re.S)
    if ppname is None:
        host_name.append('')
        host_follow.append('')
        host_fans.append('')
        host_weibo.append('')

    else:
        host_name.append(ppname.group(1))
        # Extract the host's numeric uid from the '主持人:' (host) profile link
        aboutzcr = re.search(r'主持人:<span><a target="_blank" href="[^0-9]+(.*?)\?', topic, re.S)
        if aboutzcr is not None:
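            # m.weibo.cn's container API returns the host's profile as JSON;
            # 'userInfo' holds the post, follower and followee counts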
            pp1 = "http://m.weibo.cn/api/container/getIndex?type=uid&value=" + str(aboutzcr.group(1))
            html3 = session.get(pp1).content.decode("utf-8")
            jsoncontent = json.loads(html3)
            userInfo = jsoncontent['data']['userInfo']
            statuses_count = userInfo['statuses_count']
            followers_count = userInfo['followers_count']
            follow_count = userInfo['follow_count']
            host_follow.append(follow_count)
            host_fans.append(followers_count)
            host_weibo.append(statuses_count)
        else:
            host_follow.append('')
            host_fans.append('')
            host_weibo.append('')

    return


def savetoexcel():
    print(len(top_name), len(top_rank), len(top_subtitle), len(top_reading),
          len(host_name), len(host_follow), len(host_fans), len(host_weibo))

    count = len(top_name)
    print(count)
    dfl = pd.DataFrame(data={'top_name': top_name[0:count], 'top_rank': top_rank[0:count],
                             'top_subtitle': top_subtitle[0:count], 'top_reading': top_reading[0:count],
                             'host_name': host_name[0:count], 'host_follow': host_follow[0:count],
                             'host_fans': host_fans[0:count], 'host_weibo': host_weibo[0:count]})
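    # Note: newer pandas versions pass the xlsxwriter options via
    # engine_kwargs={'options': {'strings_to_urls': False}} instead of the
    # options= keyword used below.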

    # Write to Excel
    writer = pd.ExcelWriter(r'D:\PycharmProjects\WBSpider\sina_weibo_topic50024.xlsx', engine='xlsxwriter',
                            options={'strings_to_urls': False})
    dfl.to_excel(writer, columns=['top_name', 'top_rank', 'top_subtitle', 'top_reading',
                                  'host_name', 'host_follow', 'host_fans', 'host_weibo'], index=False)
    writer.close()

    ###print(dfl)
    return



session = login('your Weibo username', 'your password')

######################### Start the crawl loop
for i in range(1, 8):
    try:
        ###print("Crawling page " + str(i) + "...")
        ### 1-hour hot list:
        ###url = "https://d.weibo.com/100803?cfs=920&Pl_Discover_Pt6Rank__5_filter=&Pl_Discover_Pt6Rank__5_page=" + str(i) + "#Pl_Discover_Pt6Rank__5"
        ### 24-hour hot list:
        url = "https://d.weibo.com/100803?cfs=920&Pl_Discover_Pt6Rank__5_filter=hothtlist_type&Pl_Discover_Pt6Rank__5_page=" + str(i) + "#Pl_Discover_Pt6Rank__5"
        html = session.get(url).content.decode("utf-8")

        ### The topic markup arrives escaped inside a JS string, so strip the
        ### literal \t / \n / backslash sequences, plus the '#' around topic names ###
        handlepage = str(html).replace('\\t', '').replace('\\n', '').replace('\\', '').replace('#', '')

        ########### Split the page so each chunk holds one topic #######################
        topic = handlepage.split("pt_li S_line2")
        ### Drop the page header (everything before the first topic)
        topic.pop(0)
        ###print("Topics on this page: " + str(len(topic)))
        for each in topic:
            analysis(each)

        ### Sleep between pages to avoid tripping rate limits / account blocks
        time.sleep(0.5)

    except Exception as e:
        print("Page %d failed: %s" % (i, e))

savetoexcel()
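
As an aside, scraping markup with regular expressions is brittle; an HTML parser survives layout changes better. Here is a minimal, untested sketch of the same per-topic extraction with BeautifulSoup. It is not part of the original script: the selectors are guesses based on the class names the regexes above target, and handlepage is assumed to hold the cleaned page produced inside the loop.

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def analysis_bs(handlepage):
    soup = BeautifulSoup(handlepage, 'html.parser')
    # Each topic appears to sit in an element with class "pt_li S_line2"
    for li in soup.select('li.pt_li.S_line2'):
        rank = li.select_one('span[class^=DSC_topicon]')
        name = li.select_one('img.pic')
        subtitle = li.select_one('div.subtitle')
        reading = li.select_one('span.number')
        print(rank.get_text(strip=True) if rank else '',
              name['alt'] if name else '',
              subtitle.get_text(strip=True) if subtitle else '',
              reading.get_text(strip=True) if reading else '')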

Result (Sina Weibo 24-hour hot topics, 2018-08-31):
[screenshot: the exported Excel sheet]

