1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
import ast
import codecs
import html
import re
import sys
import urllib
import urllib.request as requests

from bs4 import BeautifulSoup

from Blog import Blog
from ConfigData import ConfigData as cd
def get_BlogDetail(strUrl):
    """Fetch one blog-post page and assemble a Blog object from it.

    Extracts title, tags, categories, publish date, comment/read counters,
    image addresses and the raw content div from the post's HTML.

    strUrl -- full URL of the post; the 6 characters at strUrl[-11:-5]
              (just before the trailing ".html") are assumed to be the
              article id used by the counter endpoint -- TODO confirm.
    Returns a Blog instance.
    """
    request = requests.Request(strUrl)
    response = requests.urlopen(request)
    page = response.read()  # renamed: the original shadowed the stdlib `html` module
    soup = BeautifulSoup(page, 'html.parser')

    # Tags and categories both live in the div matched by patternTagDiv;
    # the slices strip the fixed wrapper text around the comma-joined values.
    strFindList = str(soup.find_all('div', cd.patternTagDiv))
    tagList = re.findall(cd.patternTag, strFindList)  # findall returns a list, never None
    if tagList:
        tag = str(tagList[0])[6:-2].split(",")
    else:
        tag = []
    categoryList = re.findall(cd.patternCategory, strFindList)
    if categoryList:
        category = str(categoryList[0])[1:-3].split(",")
    else:
        category = []

    # "<title>NAME_suffix</title>" -> strip the tags, keep the part before "_".
    strFindList = str(soup.find('title')).strip()
    title = strFindList[7:-8].split("_")[0]

    # Publish date sits in the title-detail span.
    strFindList = str(soup.find_all('span', cd.patternTitleDetail))
    publish = str(re.findall(cd.patternPublishDetail, strFindList)[0])

    # Comment/read counters come from a separate endpoint keyed by article id.
    comment, read = get_cr_num(strUrl, get_crDict("&aids=" + strUrl[-11:-5]))

    content = str(soup.find('div', cd.patternContent))

    # Collect the real image addresses; un-escape the HTML-encoded ampersands.
    # (The original called replace('&', '&'), a no-op -- almost certainly a
    # mangled '&amp;' -> '&' unescape.)
    strFindList = soup.find_all('img', title=re.compile(title))
    picAddressList = []
    for strFind in strFindList:
        picAddress = str(re.findall(r'real_src=.+src=', str(strFind)))[12:-6].strip()[:-1]
        picAddress = picAddress.replace('&amp;', '&')
        picAddressList.append(picAddress)

    return Blog(title, strUrl, publish, comment, read,
                picAddressList, content, tag, category)
def get_crDict(param):
    """Fetch the comment/read counter endpoint and parse it into a dict.

    param -- query-string fragment such as "&aids=123456".
    Returns a dict mapping article id -> {'c': comments, 'r': reads}.
    Raises ValueError/SyntaxError if the payload is not a Python literal.
    """
    url = cd.crUrl + param
    request = requests.Request(url)
    response = requests.urlopen(request)
    pattern = cd.patternCR
    htmlW = str(re.findall(pattern, str(response.read()).strip())[0][:-1])
    # SECURITY: the payload comes off the network. The original ran eval()
    # on it, which executes arbitrary code; literal_eval only accepts
    # Python literals (dicts/strings/numbers), which is all this needs.
    return ast.literal_eval(htmlW)
def get_cr_num(strUrl, cr_dict):
    """Look up the comment ('c') and read ('r') counters for one post.

    The article id is the 6-character slice strUrl[-11:-5], i.e. the
    characters immediately before the trailing ".html".

    strUrl  -- post URL whose id is embedded in the filename.
    cr_dict -- mapping of article id -> counter dict, as built by get_crDict.
    Returns a (comment, read) tuple; either element may be None when the
    counter dict lacks that key.
    """
    needFindKey = strUrl[-11:-5]
    counters = cr_dict[needFindKey]
    return counters.get('c'), counters.get('r')


def makeBlogList():
    """Crawl the blog index pages and render all posts into sina.html.

    Walks index pages 1..4 built from cd.strUrl, extracts every post URL,
    downloads each post via get_BlogDetail, then writes one formatted row
    per post using the HTML templates from ConfigData.
    """
    # Index pages to crawl: <base>1.html .. <base>4.html
    starUrlList = [cd.strUrl + str(i) + ".html" for i in range(1, 5)]

    blogList = []
    for url in starUrlList:
        request = requests.Request(url, headers=cd.headers)
        response = requests.urlopen(request)
        page = response.read()  # renamed: avoid shadowing the stdlib `html` module
        soup = BeautifulSoup(page, 'html.parser')
        # Every post link sits inside a div matching patternDiv.
        for strFind in soup.find_all('div', cd.patternDiv):
            resUrl = re.findall(cd.patternUrl, str(strFind))[0]
            blogList.append(get_BlogDetail(str(resUrl)))

    # Write each blog exactly once, after all index pages were collected.
    # (As written, the original's write loop sat inside the page loop over
    # the ever-growing blogList, duplicating earlier entries on every page.)
    # `with` guarantees the file is closed even if a write raises.
    with codecs.open('sina.html', 'w', encoding='utf-8') as fout:
        fout.write(cd.htmlStart)
        for i, blog in enumerate(blogList, start=1):
            hasPic = cd.outPic if len(blog.picAddress) > 0 else ''
            fout.write(cd.htmlCont.format(str(i), blog.title, blog.url,
                                          blog.url, hasPic, blog.comment,
                                          blog.read, blog.publish))
        fout.write(cd.htmlEnd)


if __name__ == '__main__':
    makeBlogList()
|