Scrapy: automatically following next-page links
Date: 2015-08-17 | Source: 开源中国 (OSChina)
I want to crawl Sohu News. The URL is:
http://news.sohu.com/guoneixinwen.shtml
There are many pages to crawl. On other sites I have scraped, the page source exposes the next page's href on the "next page" button, so I only need to extract that href to follow it. But here the next-page navigation is done with JavaScript, and I don't know how to handle that case. The markup is:
<a onclick="javascript:if(!isIndex){go(curPage-1)};return false;" href="#">上一页</a>
<a onclick="javascript:go(curPage+1);return false;" href="#">下一页</a>
<a onclick="javascript:go(maxPage);return false;" href="#">末页</a>
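Since each href is just "#", there is no next-page URL in the markup to extract; the go() function in the page's JavaScript builds it. A common workaround is to read go()'s definition in the page source (or browser devtools), reproduce the URL it constructs, and generate the list-page requests yourself by page number. A minimal sketch, with a hypothetical URL template standing in for whatever go() really builds:

import scrapy

# HYPOTHETICAL template -- substitute the URL that go(n) actually
# constructs, which you can read off the page's inline JavaScript.
PAGE_URL = "http://news.sohu.com/guoneixinwen_%d.shtml"

class SohuPagedSketch(scrapy.Spider):
    name = "sohu_paged_sketch"
    max_page = 50   # read from the page's maxPage variable, or set by hand

    def start_requests(self):
        # Enumerate the list pages up front instead of chasing hrefs.
        for page in range(1, self.max_page + 1):
            yield scrapy.Request(PAGE_URL % page, callback=self.parse)

    def parse(self, response):
        pass   # extract article links here, as in the spider below

This sidesteps the JavaScript entirely: the browser needs go() only to rewrite the location, but Scrapy can request the computed URLs directly.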
Below is my original approach; how should I change it?

# -*- coding: utf-8 -*-
import os
import sys
import xml.dom.minidom
from urlparse import urljoin          # Python 2; urllib.parse on Python 3

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from CpsecSpiders.items import CpsecspidersItem   # project item (import path assumed)
import sp                                         # project helper module (assumed)

reload(sys)
sys.setdefaultencoding('utf8')   # Python 2 hack kept from the original code


class CurrentPolitics(CrawlSpider):
    name = "Sohu_CurrentPolitics"
    # allowed_domains = ["news.sohu.com"]
    start_urls = []

    # <tag in rules/sohu.xml> -> <spider attribute it populates>
    XML_FIELDS = {
        'article_name': 'article_name',
        'article_url': 'article_url',
        'article_content': 'article_content',
        'next_page_url': 'next_page_url',
        'base_url': 'baseurl',
        'article_author': 'article_author',
        'article_time': 'article_time',
        'article_click_num': 'article_click_num',
        'article_reply_num': 'article_reply_num',
    }

    def __init__(self, **kwargs):
        # Let CrawlSpider initialise itself before we add our attributes.
        super(CurrentPolitics, self).__init__(**kwargs)
        # Load this spider's XPath rules from an external XML file.
        xmlpath = os.getcwd() + "/CpsecSpiders/rules/sohu.xml"
        dom = xml.dom.minidom.parse(xmlpath)
        for spider in dom.documentElement.getElementsByTagName("spider"):
            for tag, attr in self.XML_FIELDS.items():
                node = spider.getElementsByTagName(tag)[0]
                setattr(self, attr, node.childNodes[0].data)
            start_url = spider.getElementsByTagName('start_url')[0]
            self.start_urls = start_url.childNodes[0].data.split(',')

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        # List page: queue every article link, then follow the next page.
        sel = Selector(response)
        article_urls = sel.xpath(self.article_url).extract()
        next_page_url = sel.xpath(self.next_page_url).extract()
        for url in article_urls:
            item = CpsecspidersItem()   # one fresh item per article request
            request = scrapy.Request(urljoin(self.baseurl, url),
                                     callback=self.parse_second)
            request.meta['item'] = item
            yield request
        # extract() returns a list; it is empty when no next-page link exists,
        # so test the list itself rather than indexing [0] unconditionally.
        if next_page_url:
            yield scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                                 callback=self.parse)

    def parse_second(self, response):
        # Article page: pull each field with the XPaths loaded from the rules.
        sel = Selector(response)
        item = response.meta['item']
        article_url = str(response.url)
        today_timestamp = sp.get_tody_timestamp()    # project helper (sic)
        article_id = sp.hashForUrl(article_url)      # project helper
        article_name = sel.xpath(self.article_name).extract()
        article_time = sel.xpath(self.article_time).extract()
        article_content = sel.xpath(self.article_content).extract()
        article_author = sel.xpath(self.article_author).extract()
        article_click_num = sel.xpath(self.article_click_num).extract()
        article_reply_num = sel.xpath(self.article_reply_num).extract()
Hoping an expert can point me in the right direction!
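For reference, one way the parse method above could be adapted when the next-page link is JS-driven: pull curPage and maxPage out of the inline script with regular expressions (those names come from the onclick handlers quoted above, assuming the script initialises them as numeric literals) and build the next request from them. A hedged sketch; the URL template is again an assumption:

import re
from urlparse import urljoin   # Python 2; urllib.parse on Python 3

import scrapy

# HYPOTHETICAL next-page template -- mirror whatever go(n) builds in the page JS.
NEXT_PAGE_TEMPLATE = 'guoneixinwen_%d.shtml'

def next_page_request(response, base_url, callback):
    """Return a Request for the next list page, or None on the last page."""
    # curPage and maxPage live in the page's inline JavaScript, so read
    # them with regexes rather than XPath over an href.
    body = response.body_as_unicode()
    cur = re.search(r'curPage\s*=\s*(\d+)', body)
    last = re.search(r'maxPage\s*=\s*(\d+)', body)
    if cur and last and int(cur.group(1)) < int(last.group(1)):
        next_url = urljoin(base_url, NEXT_PAGE_TEMPLATE % (int(cur.group(1)) + 1))
        return scrapy.Request(next_url, callback=callback)
    return None

In parse, the "if next_page_url:" block would then become "request = next_page_request(response, self.baseurl, self.parse)" followed by "if request: yield request". If the page numbers are not embedded in the HTML at all (i.e. the list is fetched via AJAX), watching the browser's network tab for the endpoint the page calls and requesting it directly is the other standard route.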