# 返回  (stray text — commented out: as a bare statement it raised NameError on import)
import textwrap
import requests
from lxml import etree
import re
import time
import os
import json
class NovelSpider:
    """Scrape and print novel chapters from www.sudugu.com.

    Keeps a per-novel bookmark (novel title -> last-read URL) in a JSON
    file so a later run can resume where the previous one stopped.

    NOTE(review): relies on a module-level global ``file_path`` being set
    (see ``main``) before ``novel_url``/``read_html`` run — confirm callers
    always go through ``main``.
    """

    def __init__(self, novel_name=None, url=None):
        self.url = url                # current chapter/page URL (may be None)
        self.novel_name = novel_name  # bookmark key used to look up a saved URL
        self.data = None              # parsed lxml tree of the current page
        # Whole lines to drop when printing (site watermarks / ads).
        self.other_content = ["速读谷", "【必应搜:书名+得奇小说-更新最快】",
                              "【必应搜:书名+速读谷-更新最快】"]
        # In-line characters normalized to a double quote while printing.
        # BUG FIX: the original literal `["'", """]` opened an unterminated
        # triple-quoted string (a syntax error); `'"'` is the intended value.
        self.delete_content = ["'", '"']
        self.page_count = 0   # chapters to read per batch; 0 means unlimited
        self.now_page = 0     # chapters read so far in this batch
        self.novel_data = {}  # bookmark mapping: novel title -> URL

    def novel_url(self):
        """Load the bookmark file and resolve ``self.url``.

        An explicitly supplied URL takes precedence; otherwise the URL is
        looked up by ``self.novel_name`` in the bookmark data.

        Raises:
            ValueError: when neither a URL nor a resolvable bookmark exists.
        """
        with open(file_path, "r", encoding="utf-8") as json_file:
            self.novel_data = json.load(json_file)
        if self.url is not None:
            # Explicit URL wins over any stored bookmark.
            return
        if self.novel_name is None:
            raise ValueError("传入数值缺失")
        if not self.novel_data:
            raise ValueError("无小说数据,需要传入一个url新增标签")
        if self.novel_name not in self.novel_data:
            raise ValueError("小说数据不存在")
        self.url = self.novel_data[self.novel_name]

    def read_html(self):
        """Fetch the current page, advance ``self.url`` to the next
        page/chapter link found in the HTML, persist the bookmark, print
        the page, then continue via ``next_page``.

        Raises:
            ValueError: invalid ``page_count``, or no next link was found
                (novel finished or the site layout changed).
        """
        # BUG FIX: isinstance must be checked before the >= comparison,
        # otherwise a non-int page_count raised TypeError, not ValueError.
        if not (isinstance(self.page_count, int) and self.page_count >= 0):
            raise ValueError("章节数输入错误")
        # Timeout added so a stalled server cannot hang the reader forever.
        response = requests.get(self.url, timeout=30)
        html = response.text
        # Prefer "next page" (same chapter, URL contains '-'), then fall
        # back to "next chapter".
        match = re.search(r'(?<=<span><a href=")(/\d+/\d+-\d+\.html)(?=">下一页</a>)', html)
        if not match:
            match = re.search(r'(?<=<span><a href=")(/\d+/\d+\.html)(?=">下一章</a>)', html)
        if not match:
            raise ValueError("小说看完啦~或者页面结构变啦~")
        self.url = "https://www.sudugu.com" + match.group(1)
        self.data = etree.HTML(html)
        # Save the *next* URL under the novel's title so the next run
        # resumes exactly where this one left off.
        novel_name = self.data.xpath("//div/div/h1/a")
        self.novel_data[novel_name[0].xpath("string(.)")] = self.url
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(self.novel_data, json_file, ensure_ascii=False, indent=4)
        self.content()
        self.next_page()

    def content(self):
        """Print the current page's paragraphs, wrapped to 50 characters,
        skipping watermark lines and normalizing quote characters."""
        for p in self.data.xpath('//div/p'):
            text = p.xpath("string(.)")
            for junk in self.delete_content:
                text = text.replace(junk, '"')
            wrapped = textwrap.fill(text, width=50)  # at most 50 chars/line
            # Skip watermark lines; this only matches when the whole
            # paragraph equals a watermark entry (short, unwrapped lines).
            if wrapped in self.other_content:
                continue
            print("\t" + wrapped)
            time.sleep(0.3)  # pace the output so it reads like a book

    def next_page(self):
        """Pause/throttle between chapters, then fetch the next page.

        A URL without '-' marks the start of a new chapter; sub-pages of
        the same chapter flow through without pausing.

        NOTE(review): this recurses into ``read_html`` for every page, so a
        very long unlimited session may hit Python's recursion limit.

        Raises:
            IndexError: when the configured chapter batch is finished.
        """
        if "-" not in self.url:
            if self.page_count == 0:
                os.system("pause")  # unlimited mode: wait for keypress (Windows)
            elif self.page_count >= 1:
                self.now_page += 1
                if self.now_page < self.page_count:
                    print(f"剩下章节数:{self.page_count - self.now_page}")
                    time.sleep(3)  # short breather between chapters
                elif self.now_page == self.page_count:
                    raise IndexError("当前批次结束")
        self.read_html()
def main(name=None, url=None):
    """Entry point: ensure the bookmark file exists, then start reading.

    Args:
        name: bookmark (novel) name to resume from; used when ``url`` is None.
        url: explicit chapter URL; takes precedence over the bookmark.
    """
    # NovelSpider.novel_url/read_html read this module-level global.
    global file_path
    file_path = r"./novelData.json"
    if not os.path.exists(file_path):
        # Seed an empty bookmark store on first run.
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump({}, json_file, ensure_ascii=False, indent=4)
            json_file.flush()             # avoid buffered data loss on pause
            os.fsync(json_file.fileno())  # push to disk (may still lag)
    novel_spider = NovelSpider(novel_name=name, url=url)
    try:
        novel_spider.novel_url()
        novel_spider.read_html()
    except (IndexError, ValueError) as e:
        # IndexError: batch finished; ValueError: bad input / novel ended.
        # (Merged from two identical except branches that printed the same.)
        print(f"{e}")
if __name__ == '__main__':
    # A URL, if given, is read first; passing only `name` resumes from the
    # bookmark history saved in the JSON file.
    main(name="诡秘之主", url=None)