import requests
import re
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safa-ri/537.36'}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd= 阿里巴巴'
res = requests.get(url, headers=headers).text
# 正则提取
p_href = '<h3 class="c-title">.*?<a href="(.*?)"'
p_title = '<h3 class="c-title">.*?>(.*?)</a>'
p_info = '<p class="c-author">(.*?)</p>'
href = re.findall(p_href, res, re.S)
title = re.findall(p_title, res, re.S)
info = re.findall(p_info, res, re.S)
# 数据清洗及打印输出
source = []
date = []
for i in range(len(title)):
title[i] = title[i].strip()
title[i] = re.sub('<.*?>', '', title[i])
info[i] = re.sub('<.*?>', '', info[i])
source.append(info[i].split(' ')[0])
date.append(info[i].split(' ')[1])
source[i] = source[i].strip()
date[i] = date[i].strip()
print(str(i+1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')')
print(href[i])