Python实战金融大数据

2015/04/25 Python

Python实战金融大数据

Python爬虫技术

import requests
import re
headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safa-ri/537.36'}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd= 阿里巴巴'
res = requests.get(url, headers=headers).text

    # 正则提取
p_href = '<h3 class="c-title">.*?<a href="(.*?)"'
p_title = '<h3 class="c-title">.*?>(.*?)</a>'
p_info = '<p class="c-author">(.*?)</p>'
href = re.findall(p_href, res, re.S)
title = re.findall(p_title, res, re.S)
info = re.findall(p_info, res, re.S)

    # 数据清洗及打印输出
source = []
date = []
for i in range(len(title)):
    title[i] = title[i].strip()
    title[i] = re.sub('<.*?>', '', title[i])
    info[i] = re.sub('<.*?>', '', info[i])
    source.append(info[i].split('&nbsp;&nbsp;')[0])
    date.append(info[i].split('&nbsp;&nbsp;')[1])
    source[i] = source[i].strip()
    date[i] = date[i].strip()
    print(str(i+1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')')
    print(href[i])

Search

    微信好友

    博士的沙漏

    Table of Contents