Here is the code:
# -*- coding: utf-8 -*-
import scrapy


class BiqukuSpider(scrapy.Spider):
    name = 'biquku'
    allowed_domains = ['biquku.com']
    start_urls = ['https://www.biquku.co/5023/3234391.html']

    def parse(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@id="content"])').extract_first().strip()
        next_url = response.xpath('//a[@id="pager_next"]/@href').extract_first()
        yield {
            'title': title,
            'content': content
        }
        # response.urljoin automatically completes the (possibly relative) url
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
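(On that last comment: as far as I understand it, response.urljoin just resolves a possibly relative @href against the URL of the page currently being parsed, effectively like urllib's urljoin. A standalone sketch of the idea; the chapter paths here are made up for illustration:)

from urllib.parse import urljoin

# response.urljoin(next_url) behaves like urljoin(response.url, next_url):
# a relative href gets completed against the page the spider is on.
page = 'https://www.biquku.co/5023/3234391.html'
print(urljoin(page, '3234392.html'))        # -> https://www.biquku.co/5023/3234392.html
print(urljoin(page, '/5023/3234392.html'))  # -> https://www.biquku.co/5023/3234392.html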
And here is pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class XiaoshuoPipeline(object):
    def open_spider(self, spider):
        self.filename = open('quanzhifashi.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        print(item)
        info = item['title'] + '\n' + item['content'] + '\n'
        self.filename.write(info)
        return item

    def close_spider(self, spider):
        self.filename.close()
I also changed the three settings in settings.py, but the spider still only crawls one chapter...
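For reference, a sketch of the kind of settings.py changes I mean (I'm assuming the usual trio of ROBOTSTXT_OBEY, USER_AGENT and ITEM_PIPELINES; the dotted path in ITEM_PIPELINES is a guess based on the pipeline class name):

# settings.py (sketch; the three edits are assumed, adjust to the real project)

ROBOTSTXT_OBEY = False  # stop robots.txt from filtering requests

# send a browser-like User-Agent so the site serves normal pages
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

# enable the pipeline that writes quanzhifashi.txt
# ('xiaoshuo' is a guessed project name; use the actual dotted path)
ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}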