import scrapy
class XiaoshuoSpider(scrapy.Spider):
    """Crawl a novel chapter-by-chapter from www.bqgui.cc.

    Yields one dict per chapter ({"title": str|None, "content": list[str]})
    and follows the "next chapter" link until there is none.
    """

    name = "xiaoshuo"
    # BUG FIX: allowed_domains must match the host actually being crawled.
    # The original value "zhhbqg.com" did not match start_urls' host
    # (www.bqgui.cc), so Scrapy's OffsiteMiddleware silently filtered every
    # follow-up Request — only the first chapter was ever scraped.
    allowed_domains = ["bqgui.cc"]
    start_urls = ["https://www.bqgui.cc/book/108732/1.html"]

    def parse(self, response):
        # Chapter title (may be None if the <h1> XPath matches nothing).
        title = response.xpath('//h1/text()').extract_first()
        # Chapter body: list of raw text nodes inside the content div.
        content = response.xpath('//div[@id="chaptercontent"]/text()').extract()
        yield {
            "title": title,
            "content": content
        }
        # Next-chapter link; None on the last chapter, so guard before
        # following it (the original unconditionally concatenated it,
        # which would raise TypeError at the end of the book).
        next_btn = response.xpath('//a[@id="pb_next"]/@href').get()
        if next_btn:
            # urljoin handles both relative and absolute hrefs correctly.
            yield scrapy.Request(response.urljoin(next_btn), callback=self.parse)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class Scrapy06Pipeline:
    """Write every scraped chapter to a single UTF-8 text file.

    Expects items shaped like {"title": str|None, "content": list[str]}.
    """

    def open_spider(self, spider):
        # One file handle for the spider's whole lifetime; closed in
        # close_spider.
        self.file = open('xiaoshuo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # BUG FIX: title can be None when the spider's <h1> XPath matched
        # nothing; the original `item['title'] + '\n'` raised TypeError.
        self.file.write((item['title'] or '') + '\n')
        # content is a list of text nodes — join into one block, separate
        # chapters with two blank lines.
        self.file.write(''.join(item['content']) + '\n\n\n')
        return item

    def close_spider(self, spider):
        self.file.close()
# 问:下一章功能没有实现,只能爬取第一章,这是为什么呢?
# 答:allowed_domains 设成了 "zhhbqg.com",而实际抓取的域名是 bqgui.cc,
#     两者不一致,Scrapy 的 OffsiteMiddleware 会把所有后续的下一章请求
#     静默过滤掉,所以只爬到第一章。把 allowed_domains 改为 ["bqgui.cc"]
#     即可。另外最后一章时 next_btn 为 None,直接字符串拼接会抛 TypeError,
#     应先判断 next_btn 非空再发起请求。
