1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| import requests import openpyxl import re
# Open the chapter index workbook and take its active sheet, which the
# scraping loop below reads row by row.
# The workbook is bound to `workbook` rather than `book`, because this file
# later defines a function called `book` that would rebind that name.
workbook = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = workbook.active
def book(url, timeout=10):
    """Download one chapter page and append its title and text to the book file.

    The chapter body is taken from the first ``<div id="content">`` element
    of the page and the title from the first ``<h1>`` element. Both are
    appended to ``天道图书馆.txt``, which is written with the GBK codec.

    Args:
        url: Address of the chapter page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has no content ``<div>`` or no ``<h1>``.
        UnicodeEncodeError: If the chapter contains a character that the
            GBK codec cannot encode.
    """
    content = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx responses instead of trying to parse an error page.
    content.raise_for_status()

    # re.S lets "." match newlines, so a chapter body that spans several
    # lines of HTML source is still captured.
    content_url = re.findall(
        '<div id="content">(.*?)</div>', content.text, re.S
    )
    title_url = re.findall('<h1>(.*?)</h1>', content.text)
    if not content_url or not title_url:
        # Same exception type as the bare `[0]` lookup raised, but with
        # enough context to find the offending page.
        raise IndexError(f'chapter content or title not found in {url}')

    # NOTE(review): every space becomes a tab here; the literal may
    # originally have been an HTML entity such as `&nbsp;` — confirm.
    content_body = content_url[0].replace(' ', '\t').replace('<br/>', '\n')
    content_title = f'\n\n{title_url[0]}\n\n'

    with open(file='天道图书馆.txt', mode='a', encoding='GBK') as file1:
        # Title and body go out in a single write so that an encoding
        # failure on the body does not leave an orphan title in the file.
        file1.write(content_title + content_body)
# Walk the index sheet from row 2 downward (column 1 = chapter name,
# column 2 = chapter URL), scraping each chapter with book() until the
# first row that has no name.
count = 0  # chapters scraped so far; starts at 0 so the first report is chapter 1
for mulu_name, mulu_url in sheet.iter_rows(
    min_row=2, max_col=2, values_only=True
):
    if mulu_name is None:
        # An empty name cell marks the end of the index.
        break
    if mulu_url is None:
        # A row without a link cannot be fetched; report it and move on.
        print(f'{mulu_name} 缺少链接,已跳过')
        continue
    # f-string rather than "+" so a non-string cell value cannot raise TypeError.
    print(f'{mulu_name}{mulu_url}')
    book(mulu_url)
    count += 1
    print(f'第{count}章,爬取完成~~~')
print('读取完成!')
|