Crawling Sina Blog Posts with Python, Part ②

Blog.py
class Blog(object):
    def __init__(self, title, url, publish, comment, read, picAddress, content, tag, category):
        self.title = title            # subject
        self.url = url                # URL
        self.publish = publish        # publication date
        self.comment = comment        # comment count
        self.read = read              # read count
        self.picAddress = picAddress  # list of image addresses, empty if none
        self.content = content        # post body
        self.tag = tag                # list of tags, empty if none
        self.category = category      # category

    def show(self):
        print("title:", self.title)
        print("url:", self.url)
        print("publish:", self.publish)
        print("comment:", self.comment)
        print("read:", self.read)
        print("picAddress:", self.picAddress)
        print("tag:", self.tag)
        print("category:", self.category)
        print("content:", self.content)
getBlogofSina.py
import re
import codecs
import urllib.request as requests  # note: the standard library, not the third-party requests package
from bs4 import BeautifulSoup
from ConfigData import ConfigData as cd
from Blog import Blog
# Fetch the details of a single blog post
def get_BlogDetail(strUrl):
    request = requests.Request(strUrl)
    response = requests.urlopen(request)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Find the block that contains the TAG and CATEGORY
    strFindList = str(soup.find_all('div', cd.patternTagDiv))
    tagList = re.findall(cd.patternTag, strFindList)
    if tagList:
        tag = str(tagList[0])[6:-2].split(",")
    else:
        tag = []
    categoryList = re.findall(cd.patternCategory, strFindList)
    if categoryList:
        category = str(categoryList[0])[1:-3].split(",")
    else:
        category = []
    # Find the TITLE: strip the surrounding <title>...</title> tags and keep
    # only the post title before the first underscore
    strFindList = str(soup.find('title')).strip()
    title = strFindList[7:-8].split("_")[0]
    # Find the publication date
    strFindList = str(soup.find_all('span', cd.patternTitleDetail))
    publish = str(re.findall(cd.patternPublishDetail, strFindList)[0])
    # Get the comment and read counts by calling the stats API and parsing the JS
    # it returns; strUrl[-11:-5] is the 6-character article id right before ".html"
    comment, read = get_cr_num(strUrl, get_crDict("&aids=" + strUrl[-11:-5]))
    # Get the post body
    content = str(soup.find('div', cd.patternContent))
    # Collect the list of image addresses
    strFindList = soup.find_all('img', title=re.compile(title))
    picAddressList = []
    for strFind in strFindList:
        # Cut the real_src URL out of the stringified findall result
        picAddress = str(re.findall(r'real_src=.+src=', str(strFind)))[12:-6].strip()[:-1]
        picAddress = picAddress.replace('&amp;', '&')  # unescape HTML entities in the URL
        picAddressList.append(picAddress)
    return Blog(title, strUrl, publish, comment, read, picAddressList, content, tag, category)
# Fetch the dictionary of comment and read counts
# INPUT:  param, e.g. "&aids=02vdeu,0136nk"
# RETURN: dict keyed by article id; as used below, 'c' is the comment count and 'r' the read count
def get_crDict(param):
    url = cd.crUrl + param
    request = requests.Request(url)
    response = requests.urlopen(request)
    pattern = cd.patternCR
    htmlW = str(re.findall(pattern, str(response.read()).strip())[0][:-1])
    return eval(htmlW)  # evaluate the captured JS object literal into a dict
# Look up the comment and read counts for one post
# INPUT:  strUrl, e.g. http://blog.sina.com.cn/s/blog_5922f3300101e20o.html
# RETURN: (comment count, read count)
def get_cr_num(strUrl, cr_dict):
    needFindKey = strUrl[-11:-5]  # the 6-character article id before ".html"
    return cr_dict[needFindKey].get('c'), cr_dict[needFindKey].get('r')
def makeBlogList():
    # List of article-index pages to crawl
    starUrlList = []
    for i in range(1, 5):
        starUrlList.append(cd.strUrl + str(i) + ".html")
    # Generate the blog post index page in the format:
    # No. | Subject | URL | Has pictures | Comments/Reads | Publication date
    fout = codecs.open('sina.html', 'w', encoding='utf-8')
    fout.write(cd.htmlStart)
    blogList = []
    for url in starUrlList:
        request = requests.Request(url, headers=cd.headers)
        response = requests.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        strFindList = soup.find_all('div', cd.patternDiv)
        for strFind in strFindList:
            strFind = str(strFind)
            resUrl = re.findall(cd.patternUrl, strFind)[0]
            blogList.append(get_BlogDetail(str(resUrl)))
    i = 1  # running post number
    for blog in blogList:
        hasPic = cd.outPic if len(blog.picAddress) > 0 else ''
        fout.write(cd.htmlCont.format(str(i), blog.title, blog.url, blog.url,
                                      hasPic, blog.comment, blog.read, blog.publish))
        i += 1
    fout.write(cd.htmlEnd)
    fout.close()
if __name__ == '__main__':
    makeBlogList()
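
Everything site-specific lives in ConfigData, which the script imports but which is not listed in this part (it appears elsewhere in this series). For reference, the skeleton below names only the attributes the script above actually reads; every '...' is a deliberately elided value, and the comments describe what the script expects rather than the original definitions:

class ConfigData(object):
    strUrl = '...'                 # article-list URL prefix; page number plus ".html" is appended
    headers = {}                   # HTTP headers sent when fetching index pages (e.g. a User-Agent)
    crUrl = '...'                  # base URL of the comment/read-count API; "&aids=..." is appended
    patternCR = r'...'             # regex capturing the JS object literal in the API response
    patternDiv = {}                # attrs of the <div> wrapping each post entry on an index page
    patternUrl = r'...'            # regex capturing a post URL inside that <div>
    patternTagDiv = {}             # attrs of the <div> holding the tag/category block
    patternTag = r'...'            # regex for the tag text
    patternCategory = r'...'       # regex for the category text
    patternTitleDetail = {}        # attrs of the <span> holding the publication date
    patternPublishDetail = r'...'  # regex for the publication date
    patternContent = {}            # attrs of the <div> holding the post body
    outPic = '...'                 # marker written into the table when a post has pictures
    htmlStart = '...'              # opening boilerplate of the generated sina.html
    htmlCont = '...'               # row template with 8 slots: no., title, url, url, pic marker, comment, read, publish
    htmlEnd = '...'                # closing boilerplate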
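
One design note: get_crDict feeds whatever the API returns straight into eval, which would execute arbitrary code if the response ever changed. If the captured text is a plain object literal of quoted keys and numbers, as the code above assumes, ast.literal_eval is a safer drop-in inside getBlogofSina.py. This is a suggested variant, not part of the original code:

import ast

def get_crDict_safe(param):
    # Same flow as get_crDict, but ast.literal_eval accepts only Python literals
    # (dicts, lists, strings, numbers), so an unexpected response raises an
    # exception instead of executing code.
    url = cd.crUrl + param
    response = requests.urlopen(requests.Request(url))
    htmlW = str(re.findall(cd.patternCR, str(response.read()).strip())[0][:-1])
    return ast.literal_eval(htmlW)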