16. Web Scraping

Fetching the table of contents:

import requests
import re
import openpyxl

url = 'https://www.ddxs.cc/ddxs/661/'
book_html = requests.get(url)
book_html.encoding = 'gbk'  # the site serves GBK; set it explicitly to avoid garbled text

# Pull the book title, chapter titles and chapter ids out of the index page
book_name = re.findall('<h1>(.*?)</h1>', book_html.text)
mulu = re.findall('.html">(.*?)</a>', book_html.text)[1:]
mulu_num = re.findall('<a href="/ddxs/661/(.*?).html">', book_html.text)[1:]

# Build the full URL for every chapter
mulu_url = []
for i in range(len(mulu)):
    mulu_url.append(f'{url}{mulu_num[i]}.html')


# Write the table of contents to Excel
mulu_excel = openpyxl.Workbook()
sheet = mulu_excel.active

sheet['A1'] = '标题'
sheet['B1'] = '目录'

number = 2
for i in range(len(mulu)):
    sheet.cell(row=number, column=1).value = mulu[i]
    sheet.cell(row=number, column=2).value = mulu_url[i]
    number += 1

mulu_excel.save('天道图书馆.xlsx')


# Read the table of contents back from Excel
book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active

number = 2
for i in range(len(mulu)):
    # use new names here so the mulu / mulu_url lists are not overwritten
    title_cell = sheet.cell(row=number, column=1)
    url_cell = sheet.cell(row=number, column=2)
    number += 1
    print(title_cell.value + ' ' + url_cell.value)
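
If the index page comes back as an error page, the regexes above simply find nothing. A minimal sketch of the same request with a browser-like User-Agent header, a timeout, and a status check before parsing; the header value is only an example, not something this site is known to require:

import requests

url = 'https://www.ddxs.cc/ddxs/661/'
headers = {'User-Agent': 'Mozilla/5.0'}  # example value; some sites reject the default requests UA

book_html = requests.get(url, headers=headers, timeout=10)
book_html.raise_for_status()   # fail fast on 4xx/5xx instead of regex-matching an error page
book_html.encoding = 'gbk'     # same encoding fix as above
print(book_html.status_code, len(book_html.text))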

Fetching the chapter content

import requests
import openpyxl
import re


book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active


def download_chapter(chapter_url):
    # fetch one chapter page and append its text to the output file
    content = requests.get(chapter_url)
    content.encoding = 'gbk'  # same encoding fix as on the index page

    # the chapter body sits in <div id="content">...</div>; re.S lets '.' match across newlines
    content_div = re.findall('<div id="content">(.*?)</div>', content.text, re.S)
    content_body = content_div[0].replace('&nbsp;&nbsp;&nbsp;&nbsp;', '\t').replace('<br/>', '\n')
    chapter_title = re.findall('<h1>(.*?)</h1>', content.text)[0]
    content_title = f'\n\n{chapter_title}\n\n'

    with open(file='天道图书馆.txt', mode='a', encoding='GBK') as file1:
        file1.write(content_title)
        file1.write(content_body)


# Walk down the Excel sheet row by row until the first empty title cell
num = 2
count = 1
while True:
    mulu_name = sheet.cell(row=num, column=1).value
    mulu_url = sheet.cell(row=num, column=2).value
    if mulu_name is not None:
        num += 1
        print(mulu_name + mulu_url)
        download_chapter(mulu_url)
        print(f'第{count}章,爬取完成~~~')
        count += 1
    else:
        print('读取完成!')
        break
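
Downloading every chapter back-to-back hammers the site, and one bad page aborts the whole run. A minimal sketch of the same loop with a short pause between requests and a try/except that skips a chapter which fails to download or parse; it assumes the download_chapter function and the Excel layout from the script above:

import time
import requests
import openpyxl

book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active

num = 2
while True:
    mulu_name = sheet.cell(row=num, column=1).value
    mulu_url = sheet.cell(row=num, column=2).value
    if mulu_name is None:
        print('done')
        break
    try:
        download_chapter(mulu_url)  # defined in the script above
        print(f'{mulu_name} saved')
    except (requests.RequestException, IndexError) as exc:
        # a failed request or an unexpected page layout skips only this chapter
        print(f'skipped {mulu_name}: {exc}')
    time.sleep(1)  # pause between requests to stay polite to the site
    num += 1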