Background
Environment: Python 3.6, Scrapy 1.4.0
When stepping through with the debugger I can see the scraped data, but the CSV file exported from the command line comes out empty.
spider file:

import scrapy
import re
from lxml import etree
from scrapy.http import Request
from dingdian.items import DingdianItem


########################################################################
class myspider(scrapy.Spider):
    """"""
    name = 'dingdian'
    #allow_domains = ['x23us.com']
    base_url = 'http://www.23us.so/list/'

    #----------------------------------------------------------------------
    def __init__(self):
        """Constructor"""

    #----------------------------------------------------------------------
    def start_requests(self):
        """"""
        for i in range(1, 2):
            #url = self.base_url + str(i) + '_1.html'
            url = 'http://www.23us.so/list/1_1.html'
            yield Request(url, self.parse)
            #yield Request('http://www.x23us.com/quanben/1', self.parse)

    def parse(self, response):
        #print(response.url)  # the start URL is correct
        #print(response.text)
        #pattern = re.compile('<a href=.*?" class="last">(.*?)</a>')
        #pageIndex = re.findall(pattern, response)
        pageIndex = response.xpath('//a[@class="last"]/text()').extract()
        print(pageIndex)
        baseurl = str(response.url)[:-7]
        for num in range(1, int(pageIndex[0]) - 200):
            url = baseurl + '_' + str(num) + '.html'
            yield Request(url, callback=self.getname)

    #----------------------------------------------------------------------
    def getname(self, response):
        """"""
        tds = response.xpath('//tr[@bgcolor="#FFFFFF"]')
        for td in tds:
            novelname = td.xpath('./td[@class="L"]/a/text()').extract()
            novelurl = td.xpath('./td[@class="L"]/a/@href')[0].extract()
            # NOTE: meta['url'] is set to novelname here; novelurl was probably intended
            yield Request(novelurl, callback=self.getcontent,
                          meta={'name': novelname, 'url': novelname})

    #----------------------------------------------------------------------
    def getcontent(self, response):
        """"""
        item = DingdianItem()
        item['name'] = str(response.meta['name'])
        item['novelurl'] = response.url
        tds = response.xpath('//table[@id="at"]')
        for td in tds:
            #author = td.xpath('//tr[1]/td[2]/text()').extract()
            item['author'] = td.xpath('//tr[1]/td[2]/text()').extract()
            #serialstatus = td.xpath('//tr[1]/td[3]/text()').extract()
            item['serialstatus'] = td.xpath('//tr[1]/td[3]/text()').extract()
            #lastupdatatime = td.xpath('//tr[2]/td[3]/text()').extract()
            item['lastupdatatime'] = td.xpath('//tr[2]/td[3]/text()').extract()
            #like = td.xpath('//tr[2]/td[1]/text()').extract()
            item['like'] = td.xpath('//tr[2]/td[1]/text()').extract()
            # NOTE: these five names only exist as the commented-out locals
            # above, so this print raises NameError on every page
            print(author, novelurl, serialstatus, lastupdatatime, like)
        #item['author'] = response.xpath('//tbody/tr/td[1]')
        yield item
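A likely reason no items ever leave the spider is visible at the end of getcontent: author, novelurl, serialstatus, lastupdatatime and like only exist as commented-out locals, so the debug print raises NameError on every page and the callback dies before yield item runs (a breakpoint set earlier still shows data, which matches the symptom). A minimal sketch of the method with that print repaired; the leading './/' on the XPaths and the extract_first() calls are my adjustments, not the original code:

    def getcontent(self, response):
        """Fill one DingdianItem from a novel page and yield it."""
        item = DingdianItem()
        item['name'] = str(response.meta['name'])
        item['novelurl'] = response.url
        for td in response.xpath('//table[@id="at"]'):
            # './/' keeps each lookup relative to this table; a bare
            # '//tr[1]' would search the whole document instead
            item['author'] = td.xpath('.//tr[1]/td[2]/text()').extract_first()
            item['serialstatus'] = td.xpath('.//tr[1]/td[3]/text()').extract_first()
            item['lastupdatatime'] = td.xpath('.//tr[2]/td[3]/text()').extract_first()
            item['like'] = td.xpath('.//tr[2]/td[1]/text()').extract_first()
            # print the fields that were actually assigned, not undefined locals
            print(item['author'], item['serialstatus'],
                  item['lastupdatatime'], item['like'])
        yield item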
pipelines file:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
import sys


class DingdianPipeline(object):
    #----------------------------------------------------------------------
    def __init__(self):
        self.file = codecs.open('dingdian.json', mode='wb', encoding='utf-8')

    def process_item(self, item, spider):
        # NOTE: this method returns None for every item; the commented-out
        # `return item` at the bottom is the line that matters
        pass
        ##link_url = item['link_url']
        #file_name = link_url[7:-6].replace('/','_')
        #file_name += ".txt"
        #fp = open("dingdianspider.txt", 'w')
        #fp.write(item['name'],item['author'],item['novelurl'],item['serialstatus'],r"\n")
        #fp.close()
        #return item
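The other problem sits in process_item: it is just pass, so the pipeline returns None for every item, and a Scrapy pipeline that does not return the item (or raise DropItem) silently swallows it. The feed exporter that writes the CSV therefore never receives anything. If DingdianPipeline is enabled in ITEM_PIPELINES, returning the item should be enough; a minimal sketch, with the dingdian.json side-output kept as an optional extra:

# -*- coding: utf-8 -*-
import codecs
import json


class DingdianPipeline(object):
    def __init__(self):
        # text mode: json.dumps returns str under Python 3,
        # so 'wb' plus an encoding would be contradictory
        self.file = codecs.open('dingdian.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # optional: mirror each item as one JSON object per line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # returning the item is what lets the CSV feed exporter receive it
        return item

    def close_spider(self, spider):
        self.file.close()

Alternatively, if the JSON copy is not needed, simply removing the pipeline from ITEM_PIPELINES in settings.py also unblocks the CSV export.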
items file:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    novelurl = scrapy.Field()
    serialstatus = scrapy.Field()
    lastupdatatime = scrapy.Field()
    like = scrapy.Field()
    #name_id = scrapy.Field()
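For completeness: the question does not show settings.py, so the fragment below is an assumption about how the pipeline is registered. The pipeline only runs if it appears in ITEM_PIPELINES, and the export command assumed here is Scrapy's stock feed exporter:

# settings.py (hypothetical registration; the priority 300 is arbitrary)
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 300,
}

# then run, from the project root:
#   scrapy crawl dingdian -o dingdian.csv

With the NameError in getcontent and the missing return item both fixed, the same command should produce a non-empty CSV.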