网络爬虫之网页数据解析(正则re)

标签: 爬虫  网络  python  正则表达式  html  https

  • 小实例
    import re  # the snippet is meant to run on its own, so import what it uses

    # Demo sentence: words separated by spaces, '.', ',' and '!'.
    s = 'hello world python high salary 123 456 Hello 789.' \
        ' precious things are very few in the world,' \
        'that is the reason there is only one you!'
    # Alternative approach: collect the words directly instead of splitting.
    # pattern = re.compile(r'[A-Za-z]+')
    # result = re.findall(pattern,s)
    # Split on every single whitespace/punctuation character. Two adjacent
    # delimiters (such as '. ') and a trailing delimiter ('!') leave empty
    # strings in the resulting list.
    pattern = re.compile(r'[\s.,\n!]')
    result = re.split(pattern,s)
    print(result)
    
  • 正则匹配规则

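(原文此处为正则匹配规则的图片,未能保留。常用规则摘要如下:)

    .        匹配除换行符以外的任意字符
    \d \D    数字 / 非数字
    \w \W    单词字符(字母、数字、下划线)/ 非单词字符
    \s \S    空白字符 / 非空白字符
    [...]    字符集,匹配其中任意一个字符
    * + ?    前一项重复 0 次以上 / 1 次以上 / 0 或 1 次
    {m,n}    前一项重复 m 到 n 次
    ^ $      字符串开头 / 结尾
    (...)    分组,可用 \1、\2 或 group(n) 引用
    *? +?    非贪婪(尽可能少)匹配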

  • match方法:从起始位置开始查找,一次匹配
    import re

    pattern = re.compile(r'\d+')  # one or more digits

    # match() only succeeds if the pattern matches at the starting position.
    m = pattern.match('11asd55qwert88uio00')
    print(m,m.group())

    # pos=4, endpos=20: matching starts at index 4, where the digits '45' begin.
    m = pattern.match('rtyu45dfcvbnm08ertyuijknb77',4,20)
    print(m,m.group())

    pattern = re.compile(r'\w+')  # word characters
    m = pattern.match('DS25 DW DR DT')
    print(m)

    # Raw string, so the backslash in \s is passed to the regex engine untouched
    # instead of being treated as an (invalid) Python string escape.
    pattern = re.compile(r'[a-z]+\s[a-z]+\s[a-z]+', re.I)  # re.I: ignore case
    m = pattern.match('H World Wide Web')
    print(m,m.group())
    
  • search方法:从任何位置开始查找,一次匹配
    import re
    # Raw string, so \d reaches the regex engine rather than being parsed as
    # a Python string escape.
    pattern = re.compile(r'\d+')
    # search() scans the whole string; match() would fail here because the
    # string does not start with a digit.
    m = pattern.search('one12twothree34four')
    print(m)
    
  • findall方法:全部匹配,返回列表
    import re
    # Integers: every maximal run of digits in the text.
    pattern = re.compile(r'\d+')
    result = re.findall(pattern, 'hello 123456 789')
    print(result)

    # Decimals: digits, a literal dot, then optional further digits, so plain
    # integers such as 232312 are skipped while '3.' still matches.
    pattern = re.compile(r'\d+\.\d*')
    result = re.findall(pattern, "123.141593, 'bigcat', 232312, 3.15, 3.")
    for item in result:
        print(item)
    
  • finditer方法:全部匹配,返回迭代器
    import re
    pattern = re.compile(r'\d+')

    # finditer() returns a lazy iterator of Match objects rather than a list.
    result_iter = pattern.finditer('hello 123456 789')

    # The loop variable is named m1 (not `iter`) so that the builtin iter()
    # is not shadowed for the rest of the module.
    for m1 in result_iter:   # m1 is a Match object
        print('matching string: {}, position: {}'.format(m1.group(), m1.span()))
    
  • split方法:分割字符串,返回列表
    import re
    # Any run of whitespace, commas or semicolons counts as one separator,
    # so ';; ' and '   ' each produce a single split point.
    p = re.compile(r'[\s\,\;]+')
    m = re.split(p, 'a,b;; c   d')
    print(m)
    
  • sub方法:替换
    import re
    # Two words separated by exactly one space, each captured as a group.
    p = re.compile(r'(\w+) (\w+)')
    s = 'hello 123, hello 456, world 555,     hell 999'

    # Fixed replacement: every "word word" pair in s becomes 'hello world'.
    print(p.sub(r'hello world', s))
    # Back-references: swap the two captured groups of each pair.
    print(p.sub(r'\2 \1', s))

    def func(m):
        """Return the replacement for match *m*: 'hi' plus its second group."""
        return ' '.join(('hi', m.group(2)))
    # A callable replacement is invoked once per match ...
    print(p.sub(func, s))
    # ... and the optional count (here 1) limits how many matches are replaced.
    print(p.sub(func, s, 1))
    
  • 匹配中文:re.compile(u'[\u4e00-\u9fa5]+')
    • u/U:表示unicode字符串
    • r/R:非转义的原始字符串
    • b前缀代表的就是bytes
    #coding=utf8
    import re

    # Sample text mixing Chinese words with an ASCII word.
    title = u'你好,hello,世界,天安门,愿望'
    # Character class covering U+4E00..U+9FA5; '+' groups consecutive
    # characters from that range into one match.
    pattern = re.compile(u'[\u4e00-\u9fa5]+')
    result = re.findall(pattern, title)

    print(result)
    
  • 贪婪模式与非贪婪模式
    import re
    # Named `text` (not `str`) so the builtin str type is not shadowed.
    text = 'aa<div>test1</div>bb<div>test2</div>cc'

    # Non-greedy (.*?) stops at the first </div>; a greedy (.*) would run on
    # to the last </div> and capture 'test1</div>bb<div>test2'.
    p = re.compile(r'<div>(.*?)</div>')

    m = p.search(text)
    print(m,m.group())
    
  • 正则案例
    • 批量爬取图片
      import requests
      import re
      # Listing page 1 has its own URL; later pages follow the numbered template.
      url1 = 'http://sc.chinaz.com/tupian/index.html'
      url = 'http://sc.chinaz.com/tupian/index_%d.html'
      # Running total of images saved across all download_images() calls.
      num = 0
      def download_images(img_urls):
          """Download each URL in *img_urls* into ./pictures/.

          The file name is the last path segment of the URL. A URL that
          cannot be fetched (network error, timeout, HTTP error status) is
          reported and skipped so that one bad link does not abort the batch.

          Returns the module-level counter ``num``: the total number of
          images saved so far, including earlier calls.
          """
          import os
          global num
          # open() does not create missing directories, so make sure it exists.
          os.makedirs('./pictures', exist_ok=True)
          for img_url in img_urls:
              filename = img_url.rsplit('/',1)[-1]
              try:
                  # A timeout keeps one stalled server from hanging the run.
                  response = requests.get(img_url, timeout=10)
                  # Do not save an HTML error page under an image file name.
                  response.raise_for_status()
              except requests.RequestException as e:
                  print('------------图片:%s下载失败:%s-----------'%(filename, e))
                  continue
              with open('./pictures/%s'%(filename),mode = 'wb') as fp:
                  fp.write(response.content)
              print('------------图片:%s保存成功-----------'%(filename))
              num += 1
          return num
      
      if __name__ == '__main__':
          # Earlier experiment, kept for reference: dump the first listing
          # page to disk to inspect its markup.
          # response = requests.get(url1)
          # response.encoding = 'utf-8'
          # with open('./picture.html',mode='w',encoding='utf-8') as fp:
          #     fp.write(response.text)
          #     print('------------数据保存成功')
          for i in range(1,11):
              # Page 1 uses url1; pages 2..10 use the numbered template.
              url_pic = url1 if i == 1 else url%(i)
              response = requests.get(url_pic)
              response.encoding = 'utf-8'
              content = response.text
              # Markup being matched, e.g.:
              # <img src2="http://pic2.sc.chinaz.com/Files/pic/pic9/201910/bpic14126_s.jpg"
              img_urls = re.findall(r'<img src2="(.*?)"',content)
              # download_images() returns the running total, so the value
              # left after the last page is the overall count.
              number = download_images(img_urls)
          print('共计下载图片多少张%d'%(number))
      
    • 多线程爬取图片
      import re
      import requests
      import threading
      # URL of the first listing page (used by get_image_urls when the page is 1).
      url1 = 'http://sc.chinaz.com/tupian/index.html'
      # Template for the remaining listing pages; %d receives the page number.
      url = 'http://sc.chinaz.com/tupian/index_%d.html'
      
      
      def download_image(img_url):
          """Download one image into ./pictures/, named after the URL's last path segment.

          Intended to run as a thread target: a failed fetch (network error,
          timeout, HTTP error status) is reported and the function returns
          without writing a file, instead of raising inside the thread.
          """
          import os
          filename = img_url.rsplit('/',1)[-1]
          try:
              # A timeout keeps a stalled server from blocking the thread forever.
              response = requests.get(img_url, timeout=10)
              # Do not save an HTML error page under an image file name.
              response.raise_for_status()
          except requests.RequestException as e:
              print('-------图片%s下载失败:%s--------'%(filename, e))
              return
          # open() does not create missing directories; exist_ok makes this
          # safe when several threads reach it at the same time.
          os.makedirs('./pictures', exist_ok=True)
          with open('./pictures/%s'%(filename),mode = 'wb') as fp:
              fp.write(response.content)
          print('-------图片%s保存成功--------'%(filename))
      def get_image_urls(num):
          """Fetch listing pages 1..num and start one download thread per image found.

          Page 1 is read from url1 and later pages from the url template.
          Threads are started but not joined here, so the function returns
          without waiting for the downloads to finish.
          """
          for page in range(1,num + 1):
              url_pic = url1 if page == 1 else url%(page)
              print('-------开始下载第%d页图片--------'%(page))
              response = requests.get(url_pic)
              response.encoding = 'utf-8'
              # The image address is held in the src2 attribute of each <img>.
              for img_url in re.findall(r'<img src2="(.*?)"', response.text):
                  worker = threading.Thread(target = download_image,args = (img_url,))
                  worker.start()
      if __name__ == '__main__':
          # Keep prompting until the input parses as an integer. Only
          # ValueError (a non-numeric entry) triggers a retry, so Ctrl-C and
          # EOF still end the program instead of being swallowed.
          while True:
              try:
                  num = int(input('请输入获取的页码数量:'))
              except ValueError:
                  print('请输入数字!')
              else:
                  break
          get_image_urls(num)
      
    • 西刺代理
    import re
    import requests
    import time
    import random
    import threading
    
    # Listing page template; %d receives the page number.
    url = 'https://www.xicidaili.com/nn/%d'
    def get_proxies(proxies):
        """Scrape listing pages 10-19 and append the proxies found to ./proxies.txt.

        proxies: non-empty sequence of (host, port, protocol) entries. One
            entry is chosen at random and used as the outgoing proxy for
            every page request.

        Each parsed table row is appended to ./proxies.txt as
        "ip,port,PROTOCOL". A row that does not contain the expected cells
        is appended, together with the error, to ./log.txt.
        """
        host, port, _protocol = random.choice(proxies)
        # The target URL is https, so requests looks the proxy up under the
        # 'https' key. Built from the chosen entry instead of embedding a
        # fixed proxy URL with credentials in the source.
        proxy_map = {'https': 'http://%s:%s' % (host, port)}
        # Browser-like headers copied from a captured session.
        # NOTE(review): the Cookie and If-None-Match values are a stale
        # capture — confirm they are still wanted.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWMwMTk0MjI3Y2U0YzNlMzAxYTE2OTNhNzNjYWE5MjY4BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUQ2MFgwNjRkMW1TeWU4aW5Rc0ZFRUJTUWcySFQ5SkVESW4vNDFBM0o5YVk9BjsARg%3D%3D--4f5347e38cc48fa105784ff3eb74da208c89e3dc; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1572194359,1572248969,1572272353,1572320920; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1572320946',
            'Host': 'www.xicidaili.com',
            'If-None-Match': 'W/"3caa2430052219a3e8d311f50f38de44"',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36', }
        # The context manager closes the output file even if a request raises.
        with open('./proxies.txt', mode='a', encoding='utf-8') as fp:
            for i in range(10, 20):
                response = requests.get(url=url % (i),
                                        headers=headers,
                                        proxies=proxy_map,
                                        timeout=10)
                response.encoding = 'utf-8'
                html = response.text
                # Each proxy is one <tr>...</tr>; re.S lets '.' span newlines.
                # Within a row the cells of interest look like:
                #   <td>182.35.80.136</td>   ip
                #   <td>9999</td>            port
                #   <td>HTTP</td>            protocol
                result = re.findall(r'<tr.*?>(.*?)</tr>', html, flags=re.S)
                print('----------------',len(result))
                # The first row is skipped (the table's header row).
                for item in result[1:]:
                    try:
                        # The ip and port cells both consist of digits and
                        # dots only, so one pattern returns them in order.
                        ip = re.findall(r'<td>([\d\.]*)</td>', item, re.S)
                        kinds = re.findall(r'<td>([A-Z]+)</td>', item, re.S)
                        fp.write('%s,%s,%s\n' % (ip[0], ip[1], kinds[0]))
                    except IndexError as e:
                        # Row without the expected cells: log it and go on.
                        with open('./log.txt',mode = 'a',encoding='utf-8') as f:
                            f.write(item + '\n' + str(e) + '\n')
                print('第%d页代理爬取成功!' % (i))
                # Random pause between pages.
                time.sleep(random.randint(1, 3))
    # Number of proxies that passed verification; incremented by verify_proxy().
    num = 0
    # Shared file handles used by verify_proxy(): candidates are read line by
    # line from fp, and entries that pass are appended to fp2.
    # NOTE(review): both files are opened as soon as this code runs, and
    # './proxies.txt' is opened read-only, so it must already exist — confirm
    # this ordering is intended.
    fp = open('./proxies.txt','r',encoding='utf-8')
    fp2 = open('./verified_proxie.txt','a',encoding='utf-8')
    
    def verify_proxy():
        """Test the proxies listed in the shared handle ``fp``, one per line.

        Each line is expected to be "ip,port,protocol". The proxy is used for
        one request with a 3-second timeout; entries that succeed are
        appended to the shared handle ``fp2`` and counted in the global
        ``num``. Reading stops at the first empty line or at end of file.

        Returns the current value of the global counter ``num``.
        """
        global num
        while True:
            line = fp.readline().strip('\n')
            if line == '':
                break
            try:
                ip, port, protocol = line.split(',')
            except ValueError:
                # Malformed line: report it and move on, instead of falling
                # through with unbound or stale values from an earlier line.
                print('------------------------------',line)
                continue
            # requests selects a proxy by the scheme of the target URL, so an
            # HTTPS proxy is tested against an https site and any other
            # proxy against an http site; a mismatch would bypass the proxy.
            if protocol == 'HTTPS':
                scheme, test_url = 'https', 'https://ip.cn/'
            else:
                scheme, test_url = 'http', 'http://ip.tool.chinaz.com/'
            try:
                # The proxy URL carries an explicit http:// scheme, which is
                # what requests assumes when none is given.
                requests.get(test_url,
                             proxies={scheme: 'http://%s:%s' % (ip, port)},
                             timeout=3)
            except requests.RequestException:
                print('该ip:%s:%s验证失败' % (ip, port))
                continue
            print('该ip:%s:%s验证通过' % (ip, port))
            fp2.write('%s,%s,%s\n' % (ip, port, protocol))
            num += 1
        return num
    
    if __name__ == '__main__':
        # Load the previously verified proxies, one comma-separated entry per
        # line, and split each line into its fields.
        with open('./verified_proxie.txt',mode = 'r',encoding='utf-8') as f:
            proxies = [row.strip('\n').split(',') for row in f.readlines()]
        print(proxies)
        get_proxies(proxies)
        # Disabled verification step: run verify_proxy() in many threads.
        # threads = []
        # for i in range(1000):
        #     t = threading.Thread(target=verify_proxy)
        #     t.start()
        #     threads.append(t)
        # # join() goes in its own loop so that every thread is started
        # # before the main thread begins waiting.
        # for t in threads:
        #     t.join()
        # print('-----------------所有的子线程结束任务,主线程开始执行')
        # fp.close()
        # fp2.close()
    
    • 正则表达式测试网站

      [https://tool.oschina.net/regex/](https://tool.oschina.net/regex/)

nge(1000):
# t = threading.Thread(target=verify_proxy)
# t.start()
# threads.append(t)
# # join必须单独写,目的:线程启动
# for t in threads:
# t.join()
# print(’-----------------所有的子线程结束任务,主线程开始执行’)
# fp.close()
# fp2.close()




- ##### 正则表达式测试网站

  [https://tool.oschina.net/regex/](https://tool.oschina.net/regex/)

  
版权声明:本文为qq_42546127原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/qq_42546127/article/details/106385530