Crawling Sina Blog Posts with Python, Part ②

Blog.py
class Blog(object):
    def __init__(self, title, url, publish, comment, read, picAddress, content, tag, category):
        self.title = title            # subject
        self.url = url                # URL
        self.publish = publish        # publication date
        self.comment = comment        # comment count
        self.read = read              # read count
        self.picAddress = picAddress  # list of image addresses, empty if none
        self.content = content        # post body
        self.tag = tag                # list of tags, empty if none
        self.category = category      # category

    def show(self):
        print("title:", self.title)
        print("url:", self.url)
        print("publish:", self.publish)
        print("comment:", self.comment)
        print("read:", self.read)
        print("picAddress:", self.picAddress)
        print("tag:", self.tag)
        print("category:", self.category)
        print("content:", self.content)
getBlogofSina.py
import re
import codecs
import urllib.request as requests  # note: the standard library, not the third-party requests package
from bs4 import BeautifulSoup
from ConfigData import ConfigData as cd
from Blog import Blog
# Fetch the details of a single blog post
def get_BlogDetail(strUrl):
    request = requests.Request(strUrl)
    response = requests.urlopen(request)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Find the block that contains the TAG and CATEGORY
    strFindList = str(soup.find_all('div', cd.patternTagDiv))
    tagList = re.findall(cd.patternTag, strFindList)
    if tagList:
        tag = str(tagList[0])[6:-2].split(",")
    else:
        tag = []
    categoryList = re.findall(cd.patternCategory, strFindList)
    if categoryList:
        category = str(categoryList[0])[1:-3].split(",")
    else:
        category = []
    # Find the TITLE: strip the surrounding <title>...</title> tags and keep
    # only the post title before the first underscore
    strFindList = str(soup.find('title')).strip()
    title = strFindList[7:-8].split("_")[0]
    # Find the publication date
    strFindList = str(soup.find_all('span', cd.patternTitleDetail))
    publish = str(re.findall(cd.patternPublishDetail, strFindList)[0])
    # Get the comment and read counts by calling the stats API and parsing the JS
    # it returns; strUrl[-11:-5] is the 6-character article id right before ".html"
    comment, read = get_cr_num(strUrl, get_crDict("&aids=" + strUrl[-11:-5]))
    # Get the post body
    content = str(soup.find('div', cd.patternContent))
    # Collect the list of image addresses
    strFindList = soup.find_all('img', title=re.compile(title))
    picAddressList = []
    for strFind in strFindList:
        # Cut the real_src URL out of the stringified findall result
        picAddress = str(re.findall(r'real_src=.+src=', str(strFind)))[12:-6].strip()[:-1]
        picAddress = picAddress.replace('&amp;', '&')  # unescape HTML entities in the URL
        picAddressList.append(picAddress)
    return Blog(title, strUrl, publish, comment, read, picAddressList, content, tag, category)
# Fetch the dictionary of comment and read counts
# INPUT:  param, e.g. "&aids=02vdeu,0136nk"
# RETURN: dict keyed by article id; as used below, 'c' is the comment count and 'r' the read count
def get_crDict(param):
    url = cd.crUrl + param
    request = requests.Request(url)
    response = requests.urlopen(request)
    pattern = cd.patternCR
    htmlW = str(re.findall(pattern, str(response.read()).strip())[0][:-1])
    return eval(htmlW)  # evaluate the captured JS object literal into a dict
# Look up the comment and read counts for one post
# INPUT:  strUrl, e.g. http://blog.sina.com.cn/s/blog_5922f3300101e20o.html
# RETURN: (comment count, read count)
def get_cr_num(strUrl, cr_dict):
    needFindKey = strUrl[-11:-5]  # the 6-character article id before ".html"
    return cr_dict[needFindKey].get('c'), cr_dict[needFindKey].get('r')
def makeBlogList():
    # List of article-index pages to crawl
    starUrlList = []
    for i in range(1, 5):
        starUrlList.append(cd.strUrl + str(i) + ".html")
    # Generate the blog post index page in the format:
    # No. | Subject | URL | Has pictures | Comments/Reads | Publication date
    fout = codecs.open('sina.html', 'w', encoding='utf-8')
    fout.write(cd.htmlStart)
    blogList = []
    for url in starUrlList:
        request = requests.Request(url, headers=cd.headers)
        response = requests.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        strFindList = soup.find_all('div', cd.patternDiv)
        for strFind in strFindList:
            strFind = str(strFind)
            resUrl = re.findall(cd.patternUrl, strFind)[0]
            blogList.append(get_BlogDetail(str(resUrl)))
    i = 1  # running post number
    for blog in blogList:
        hasPic = cd.outPic if len(blog.picAddress) > 0 else ''
        fout.write(cd.htmlCont.format(str(i), blog.title, blog.url, blog.url,
                                      hasPic, blog.comment, blog.read, blog.publish))
        i += 1
    fout.write(cd.htmlEnd)
    fout.close()
if __name__ == '__main__':
    makeBlogList()
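
Everything site-specific lives in ConfigData, which the script imports but which is not listed in this part (it appears elsewhere in this series). For reference, the skeleton below names only the attributes the script above actually reads; every '...' is a deliberately elided value, and the comments describe what the script expects rather than the original definitions:

class ConfigData(object):
    strUrl = '...'                 # article-list URL prefix; page number plus ".html" is appended
    headers = {}                   # HTTP headers sent when fetching index pages (e.g. a User-Agent)
    crUrl = '...'                  # base URL of the comment/read-count API; "&aids=..." is appended
    patternCR = r'...'             # regex capturing the JS object literal in the API response
    patternDiv = {}                # attrs of the <div> wrapping each post entry on an index page
    patternUrl = r'...'            # regex capturing a post URL inside that <div>
    patternTagDiv = {}             # attrs of the <div> holding the tag/category block
    patternTag = r'...'            # regex for the tag text
    patternCategory = r'...'       # regex for the category text
    patternTitleDetail = {}        # attrs of the <span> holding the publication date
    patternPublishDetail = r'...'  # regex for the publication date
    patternContent = {}            # attrs of the <div> holding the post body
    outPic = '...'                 # marker written into the table when a post has pictures
    htmlStart = '...'              # opening boilerplate of the generated sina.html
    htmlCont = '...'               # row template with 8 slots: no., title, url, url, pic marker, comment, read, publish
    htmlEnd = '...'                # closing boilerplate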
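
One design note: get_crDict feeds whatever the API returns straight into eval, which would execute arbitrary code if the response ever changed. If the captured text is a plain object literal of quoted keys and numbers, as the code above assumes, ast.literal_eval is a safer drop-in inside getBlogofSina.py. This is a suggested variant, not part of the original code:

import ast

def get_crDict_safe(param):
    # Same flow as get_crDict, but ast.literal_eval accepts only Python literals
    # (dicts, lists, strings, numbers), so an unexpected response raises an
    # exception instead of executing code.
    url = cd.crUrl + param
    response = requests.urlopen(requests.Request(url))
    htmlW = str(re.findall(cd.patternCR, str(response.read()).strip())[0][:-1])
    return ast.literal_eval(htmlW)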